diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 89494bfd2a..8518fb56c4 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -116,6 +116,7 @@ struct Intrinsic { #[serde(rename = "return")] return_: Return, name: String, + tech: String, #[serde(rename = "CPUID", default)] cpuid: Vec, #[serde(rename = "parameter", default)] @@ -155,7 +156,7 @@ fn verify_all_signatures() { // https://software.intel.com/sites/landingpage/IntrinsicsGuide/# // // Open up the network console and you'll see an xml file was downloaded - // (currently called data-3.4.xml). That's the file we downloaded + // (currently called data-3.6.xml). That's the file we downloaded // here. let xml = include_bytes!("../x86-intel.xml"); @@ -381,17 +382,14 @@ fn verify_all_signatures() { continue; } - // we'll get to avx-512 later - // let avx512 = intel.iter().any(|i| { - // i.name.starts_with("_mm512") || i.cpuid.iter().any(|c| { - // c.contains("512") - // }) - // }); - // if avx512 { - // continue - // } - for intel in intel { + // ignore intrinsics from Intel's Short Vector Math Library (SVML) + // these don't map directly to single instructions but correspond + // to optimized functions implemented in that library + if intel.tech == "SVML" { + continue; + } + missing .entry(&intel.cpuid) .or_insert_with(Vec::new) @@ -406,8 +404,7 @@ fn verify_all_signatures() { println!("\n
{:?}

\n", k); for intel in v { let url = format!( - "https://software.intel.com/sites/landingpage\ - /IntrinsicsGuide/#text={}&expand=5236", + "https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text={}", intel.name ); println!(" * [ ] [`{}`]({})", intel.name, url); diff --git a/crates/stdarch-verify/x86-intel.xml b/crates/stdarch-verify/x86-intel.xml index 264ecee0e6..01be6bdc93 100644 --- a/crates/stdarch-verify/x86-intel.xml +++ b/crates/stdarch-verify/x86-intel.xml @@ -1,14 +1,11 @@ - - - Integer - Flag - ADX - Arithmetic - - - - - + + + + + + + + Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 1 : 0) @@ -16,20 +13,18 @@ MEM[out+31:out] := tmp[31:0] dst[0] := tmp[32] dst[7:1] := 0 - - -

immintrin.h
- - - Integer - Flag + + ADX +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) @@ -37,91 +32,85 @@ MEM[out+63:out] := tmp[63:0] dst[0] := tmp[64] dst[7:1] := 0 - - + + + ADX
immintrin.h
-
- - Integer - AES - Cryptography - - - + Arithmetic + + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." a[127:0] := ShiftRows(a[127:0]) a[127:0] := SubBytes(a[127:0]) a[127:0] := MixColumns(a[127:0]) dst[127:0] := a[127:0] XOR RoundKey[127:0] - -
wmmintrin.h
-
- - Integer + AES +
wmmintrin.h
Cryptography - - - +
+ + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." a[127:0] := ShiftRows(a[127:0]) a[127:0] := SubBytes(a[127:0]) dst[127:0] := a[127:0] XOR RoundKey[127:0] - -
wmmintrin.h
-
- - Integer + AES +
wmmintrin.h
Cryptography - - - +
+ + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". a[127:0] := InvShiftRows(a[127:0]) a[127:0] := InvSubBytes(a[127:0]) a[127:0] := InvMixColumns(a[127:0]) dst[127:0] := a[127:0] XOR RoundKey[127:0] - -
wmmintrin.h
-
- - Integer + AES +
wmmintrin.h
Cryptography - - - +
+ + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". a[127:0] := InvShiftRows(a[127:0]) a[127:0] := InvSubBytes(a[127:0]) dst[127:0] := a[127:0] XOR RoundKey[127:0] - -
wmmintrin.h
-
- - Integer + AES +
wmmintrin.h
Cryptography - - +
+ + + Perform the InvMixColumns transformation on "a" and store the result in "dst". dst[127:0] := InvMixColumns(a[127:0]) - -
wmmintrin.h
-
- - Integer + AES +
wmmintrin.h
Cryptography - - - +
+ + + + Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst"." X3[31:0] := a[127:96] X2[31:0] := a[95:64] @@ -133,14 +122,14 @@ dst[63:32] := RotWord(SubWord(X1)) XOR RCON dst[95:64] := SubWord(X3) dst[127:96] := RotWord(SubWord(X3)) XOR RCON - + + AES
wmmintrin.h
-
- - Tile - Floating Point - AMXBF16 - Application-Targeted + Cryptography + + + + @@ -159,13 +148,14 @@ ENDFOR zero_upper_rows(dst, dst.rows) zero_tileconfig_start() - + + AMXBF16
immintrin.h
-
- - Tile - AMXINT8 Application-Targeted + + + + @@ -191,13 +181,12 @@ ENDFOR zero_upper_rows(dst, dst.rows) zero_tileconfig_start() - -
immintrin.h
-
- - Tile + AMXINT8 +
immintrin.h
Application-Targeted +
+ @@ -223,13 +212,12 @@ ENDFOR zero_upper_rows(dst, dst.rows) zero_tileconfig_start() - -
immintrin.h
-
- - Tile + AMXINT8 +
immintrin.h
Application-Targeted +
+ @@ -255,13 +243,12 @@ ENDFOR zero_upper_rows(dst, dst.rows) zero_tileconfig_start() - -
immintrin.h
-
- - Tile + AMXINT8 +
immintrin.h
Application-Targeted +
+ @@ -287,15 +274,16 @@ ENDFOR zero_upper_rows(dst, dst.rows) zero_tileconfig_start() - + + AMXINT8
immintrin.h
-
- - Tile - AMXTILE Application-Targeted + + + + - + Load tile configuration from a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type pallette, the number of bytes per row, and the number of rows. If the specified pallette_id is zero, that signifies the init state for both the tile config and the tile data, and the tiles are zeroed. Any invalid configurations will result in #GP fault. // format of memory payload. each field is a byte. @@ -313,15 +301,14 @@ zero_tileconfig_start() // ... // 63: tile15.rows - -
immintrin.h
-
- - Tile + AMXTILE +
immintrin.h
Application-Targeted +
+ - + Stores the current tile configuration to a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type pallette, the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory. // format of memory payload. each field is a byte. @@ -339,17 +326,16 @@ zero_tileconfig_start() // ... // 63: tile15.rows - -
immintrin.h
-
- - Tile + AMXTILE +
immintrin.h
Application-Targeted +
+ - + Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". start := tileconfig.startRow IF start == 0 // not restarting, zero incoming state @@ -364,17 +350,16 @@ OD zero_upper_rows(dst, dst.rows) zero_tileconfig_start() - -
immintrin.h
-
- - Tile + AMXTILE +
immintrin.h
Application-Targeted +
+ - + Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly. start := tileconfig.startRow IF start == 0 // not restarting, zero incoming state @@ -389,26 +374,24 @@ OD zero_upper_rows(dst, dst.rows) zero_tileconfig_start() - -
immintrin.h
-
- - Tile + AMXTILE +
immintrin.h
Application-Targeted +
+ Release the tile configuration to return to the init state, which releases all storage it currently holds. -
immintrin.h
-
- - Tile AMXTILE +
immintrin.h
Application-Targeted +
+ - + - + Store the tile specified by "src" to memory specifieid by "base" address and "stride" using the tile configuration previously configured via "_tile_loadconfig". start := tileconfig.startRow DO WHILE start < src.rows @@ -418,13 +401,12 @@ DO WHILE start < src.rows OD zero_tileconfig_start() - -
immintrin.h
-
- - Tile + AMXTILE +
immintrin.h
Application-Targeted +
+ Zero the tile specified by "tdest". @@ -435,2431 +417,2116 @@ FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1 ENDFOR ENDFOR - + + AMXTILE
immintrin.h
-
- - Floating Point - AVX - Arithmetic - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + Application-Targeted + + + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+63:i] := ACOS(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := ACOS(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] + b[i+63:i] - FI + dst[i+63:i] := ACOSH(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] + b[i+31:i] - FI + dst[i+31:i] := ACOSH(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Logical - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + dst[i+63:i] := ASIN(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Logical - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + dst[i+31:i] := ASIN(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Logical - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + dst[i+63:i] := ASINH(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Logical - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + dst[i+31:i] := ASINH(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - IF imm8[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI + dst[i+63:i] := ATAN(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - IF imm8[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI + dst[i+31:i] := ATAN(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. FOR j := 0 to 3 i := j*64 - IF mask[i+63] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. FOR j := 0 to 7 i := j*32 - IF mask[i+31] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 - i := 64*j - dst[i+63:i] := a[i+63:i] / b[i+63:i] + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := a[i+31:i] / b[i+31:i] + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - - Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". - -DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { - FOR j := 0 to 3 - i := j*32 - IF imm8[(4+j)%8] - temp[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - temp[i+31:i] := FP32(0.0) - FI - ENDFOR - - sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) - - FOR j := 0 to 3 - i := j*32 - IF imm8[j%8] - tmpdst[i+31:i] := sum[31:0] - ELSE - tmpdst[i+31:i] := FP32(0.0) - FI - ENDFOR - RETURN tmpdst[127:0] -} -dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) -dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0]) -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Arithmetic - - - - Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + Trigonometry + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -dst[63:0] := a[127:64] + a[63:0] -dst[127:64] := b[127:64] + b[63:0] -dst[191:128] := a[255:192] + a[191:128] -dst[255:192] := b[255:192] + b[191:128] +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -dst[31:0] := a[63:32] + a[31:0] -dst[63:32] := a[127:96] + a[95:64] -dst[95:64] := b[63:32] + b[31:0] -dst[127:96] := b[127:96] + b[95:64] -dst[159:128] := a[191:160] + a[159:128] -dst[191:160] := a[255:224] + a[223:192] -dst[223:192] := b[191:160] + b[159:128] -dst[255:224] := b[255:224] + b[223:192] +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[63:0] := a[63:0] - a[127:64] -dst[127:64] := b[63:0] - b[127:64] -dst[191:128] := a[191:128] - a[255:192] -dst[255:192] := b[191:128] - b[255:192] +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[31:0] := a[31:0] - a[63:32] -dst[63:32] := a[95:64] - a[127:96] -dst[95:64] := b[31:0] - b[63:32] -dst[127:96] := b[95:64] - b[127:96] -dst[159:128] := a[159:128] - a[191:160] -dst[191:160] := a[223:192] - a[255:224] -dst[223:192] := b[159:128] - b[191:160] -dst[255:224] := b[223:192] - b[255:224] +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Special Math Functions - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := COSH(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Special Math Functions - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := COSH(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Special Math Functions - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Special Math Functions - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := SIN(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := SIN(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Logical - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Logical - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". - -dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SIND(a[i+63:i]) +ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Swizzle - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SINH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Arithmetic - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := TAN(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Arithmetic - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := TAN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Logical - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := TANH(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Logical - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+31:i] := TANH(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 1 +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 + dst[i+63:i] := CubeRoot(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CEXP(a[31:0], b[31:0]) { + result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) + result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) + RETURN result +} FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 + dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CLOG(a[31:0], b[31:0]) { + result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) + result[63:32] := ATAN2(b, a) + RETURN result +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:256] := 0 + AVX - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed complex snumbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CSQRT(a[31:0], b[31:0]) { + sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) + result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + RETURN result +} FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 + i := j*64 + dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Compare - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Compare - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] -dst[MAX:128] := 0 + Elementary Math Functions + + + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer AVX - Convert - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point - Integer + Elementary Math Functions + + + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Convert - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer AVX - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InvCubeRoot(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InvCubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point - Integer + Elementary Math Functions + + + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point - Integer + Elementary Math Functions + + + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer AVX - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer + Elementary Math Functions + + + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 + AVX - Swizzle - - - - Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer + Elementary Math Functions + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Swizzle - - - - Extract a 32-bit integer from "a", selected with "index", and store the result in "dst". - -dst[31:0] := (a[255:0] >> (index[2:0] * 32))[31:0] +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX
immintrin.h
-
- - Integer + Elementary Math Functions + + + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Swizzle - - - - Extract a 64-bit integer from "a", selected with "index", and store the result in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". -dst[63:0] := (a[255:0] >> (index[1:0] * 64))[63:0] +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX
immintrin.h
-
- + Elementary Math Functions + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - General Support - - - Zero the contents of all XMM or YMM registers. - YMM0[MAX:0] := 0 -YMM1[MAX:0] := 0 -YMM2[MAX:0] := 0 -YMM3[MAX:0] := 0 -YMM4[MAX:0] := 0 -YMM5[MAX:0] := 0 -YMM6[MAX:0] := 0 -YMM7[MAX:0] := 0 -IF _64_BIT_MODE - YMM8[MAX:0] := 0 - YMM9[MAX:0] := 0 - YMM10[MAX:0] := 0 - YMM11[MAX:0] := 0 - YMM12[MAX:0] := 0 - YMM13[MAX:0] := 0 - YMM14[MAX:0] := 0 - YMM15[MAX:0] := 0 -FI - - -
immintrin.h
-
- - AVX - General Support - - - Zero the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified. - YMM0[MAX:128] := 0 -YMM1[MAX:128] := 0 -YMM2[MAX:128] := 0 -YMM3[MAX:128] := 0 -YMM4[MAX:128] := 0 -YMM5[MAX:128] := 0 -YMM6[MAX:128] := 0 -YMM7[MAX:128] := 0 -IF _64_BIT_MODE - YMM8[MAX:128] := 0 - YMM9[MAX:128] := 0 - YMM10[MAX:128] := 0 - YMM11[MAX:128] := 0 - YMM12[MAX:128] := 0 - YMM13[MAX:128] := 0 - YMM14[MAX:128] := 0 - YMM15[MAX:128] := 0 -FI - -
immintrin.h
-
- - Floating Point - AVX - Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + Elementary Math Functions + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], b[1:0]) -dst[63:32] := SELECT4(a[127:0], b[33:32]) -dst[95:64] := SELECT4(a[127:0], b[65:64]) -dst[127:96] := SELECT4(a[127:0], b[97:96]) -dst[159:128] := SELECT4(a[255:128], b[129:128]) -dst[191:160] := SELECT4(a[255:128], b[161:160]) -dst[223:192] := SELECT4(a[255:128], b[193:192]) -dst[255:224] := SELECT4(a[255:128], b[225:224]) +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], b[1:0]) -dst[63:32] := SELECT4(a[127:0], b[33:32]) -dst[95:64] := SELECT4(a[127:0], b[65:64]) -dst[127:96] := SELECT4(a[127:0], b[97:96]) -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -dst[255:224] := SELECT4(a[255:128], imm8[7:6]) + Elementary Math Functions + + + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". - -IF (b[1] == 0) dst[63:0] := a[63:0]; FI -IF (b[1] == 1) dst[63:0] := a[127:64]; FI -IF (b[65] == 0) dst[127:64] := a[63:0]; FI -IF (b[65] == 1) dst[127:64] := a[127:64]; FI -IF (b[129] == 0) dst[191:128] := a[191:128]; FI -IF (b[129] == 1) dst[191:128] := a[255:192]; FI -IF (b[193] == 0) dst[255:192] := a[191:128]; FI -IF (b[193] == 1) dst[255:192] := a[255:192]; FI + Probability/Statistics + + + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". - -IF (b[1] == 0) dst[63:0] := a[63:0]; FI -IF (b[1] == 1) dst[63:0] := a[127:64]; FI -IF (b[65] == 0) dst[127:64] := a[63:0]; FI -IF (b[65] == 1) dst[127:64] := a[127:64]; FI -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI + Probability/Statistics + + + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". - -IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Swizzle - - - - - Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) + Probability/Statistics + + + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer AVX - Swizzle - - - - - Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Probability/Statistics + + + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Load - - - Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". - -tmp[31:0] := MEM[mem_addr+31:mem_addr] -FOR j := 0 to 7 +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 i := j*32 - dst[i+31:i] := tmp[31:0] + dst[i+63:i] := 1.0 - ERF(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Probability/Statistics + + + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:256] := 0 + AVX - Load - Swizzle - - - Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". - -tmp[31:0] := MEM[mem_addr+31:mem_addr] -FOR j := 0 to 3 +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 i := j*32 - dst[i+31:i] := tmp[31:0] + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Load - Swizzle - - - Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst". - -tmp[63:0] := MEM[mem_addr+63:mem_addr] -FOR j := 0 to 3 +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 i := j*64 - dst[i+63:i] := tmp[63:0] + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Load - Swizzle - - - Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst". - -tmp[127:0] := MEM[mem_addr+127:mem_addr] -dst[127:0] := tmp[127:0] -dst[255:128] := tmp[127:0] +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+63:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Load - Swizzle - - - Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst". +
immintrin.h
+ Probability/Statistics +
+ + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -tmp[127:0] := MEM[mem_addr+127:mem_addr] -dst[127:0] := tmp[127:0] -dst[255:128] := tmp[127:0] +FOR j := 0 to 31 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC +FOR j := 0 to 15 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Swizzle - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -dst[255:0] := a[255:0] -CASE imm8[0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC +FOR j := 0 to 7 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer AVX - Swizzle - - - - - Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC +FOR j := 0 to 3 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer AVX - Swizzle - - - - - Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index". - -dst[255:0] := a[255:0] -sel := index[4:0]*8 -dst[sel+7:sel] := i[7:0] -
immintrin.h
-
- - Integer - AVX - Swizzle - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index". + Arithmetic + + + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -dst[255:0] := a[255:0] -sel := index[3:0]*16 -dst[sel+15:sel] := i[15:0] +FOR j := 0 to 31 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 -
immintrin.h
-
- - Integer AVX - Swizzle - - - - - Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index". - -dst[255:0] := a[255:0] -sel := index[2:0]*32 -dst[sel+31:sel] := i[31:0] -
immintrin.h
-
- - Integer - AVX - Swizzle - - - - - Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index". + Arithmetic + + + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -dst[255:0] := a[255:0] -sel := index[1:0]*64 -dst[sel+63:sel] := i[63:0] +FOR j := 0 to 15 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Load - - - Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -dst[255:0] := MEM[mem_addr+255:mem_addr] +FOR j := 0 to 7 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Store - - - - Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - -
immintrin.h
-
- - Floating Point - AVX - Load - - - Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -dst[255:0] := MEM[mem_addr+255:mem_addr] +FOR j := 0 to 3 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Store - - - - Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - -
immintrin.h
-
- - Floating Point - AVX - Load - - - Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] + Arithmetic + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Store - - - - Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - -
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Load - - - Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 31 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Store - - - - Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 15 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer + Arithmetic + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Load - - - Load 256-bits of integer data from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer + Arithmetic + + + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 31 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Store - - - - Store 256-bits of integer data from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 15 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer + Arithmetic + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Load - - - Load 256-bits of integer data from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer + Arithmetic + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Store - - - - Store 256-bits of integer data from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Load - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). +
immintrin.h
+ Arithmetic +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. FOR j := 0 to 3 i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. FOR j := 0 to 3 i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI + dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Load - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI + dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point AVX - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. FOR j := 0 to 7 i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := ROUND(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + AVX - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". +
immintrin.h
+ Miscellaneous +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI + dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). +
immintrin.h
+ Arithmetic +
+ + + + + Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := a[i+63:i] + b[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". +
immintrin.h
+ Arithmetic +
+ + + + + Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + IF ((j & 1) == 0) + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + b[i+31:i] FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Move - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". -dst[31:0] := a[63:32] -dst[63:32] := a[63:32] -dst[95:64] := a[127:96] -dst[127:96] := a[127:96] -dst[159:128] := a[191:160] -dst[191:160] := a[191:160] -dst[223:192] := a[255:224] -dst[255:224] := a[255:224] +FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Move - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". -dst[31:0] := a[31:0] -dst[63:32] := a[31:0] -dst[95:64] := a[95:64] -dst[127:96] := a[95:64] -dst[159:128] := a[159:128] -dst[191:160] := a[159:128] -dst[223:192] := a[223:192] -dst[255:224] := a[223:192] +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Move - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". -dst[63:0] := a[63:0] -dst[127:64] := a[63:0] -dst[191:128] := a[191:128] -dst[255:192] := a[191:128] +DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { + FOR j := 0 to 3 + i := j*32 + IF imm8[(4+j)%8] + temp[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + temp[i+31:i] := FP32(0.0) + FI + ENDFOR + + sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) + + FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + tmpdst[i+31:i] := sum[31:0] + ELSE + tmpdst[i+31:i] := FP32(0.0) + FI + ENDFOR + RETURN tmpdst[127:0] +} +dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) +dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX - Load - - - Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary. +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". -dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[63:0] := a[127:64] + a[63:0] +dst[127:64] := b[127:64] + b[63:0] +dst[191:128] := a[255:192] + a[191:128] +dst[255:192] := b[255:192] + b[191:128] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX - Store - - - - Store 256-bits of integer data from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". -MEM[mem_addr+255:mem_addr] := a[255:0] +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] +dst[159:128] := a[191:160] + a[159:128] +dst[191:160] := a[255:224] + a[223:192] +dst[223:192] := b[191:160] + b[159:128] +dst[255:224] := b[255:224] + b[223:192] +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Store - - - - Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". -MEM[mem_addr+255:mem_addr] := a[255:0] +dst[63:0] := a[63:0] - a[127:64] +dst[127:64] := b[63:0] - b[127:64] +dst[191:128] := a[191:128] - a[255:192] +dst[255:192] := b[191:128] - b[255:192] +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Store - - - - Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". -MEM[mem_addr+255:mem_addr] := a[255:0] +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] +dst[159:128] := a[159:128] - a[191:160] +dst[191:160] := a[223:192] - a[255:224] +dst[223:192] := b[159:128] - b[191:160] +dst[255:224] := b[223:192] - b[255:224] +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := 1.0 / a[i+31:i] +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) + dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Special Math Functions - - - - Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". - [round_note] +
immintrin.h
+ Arithmetic +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ROUND(a[i+63:i], rounding) + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Special Math Functions - - - - Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". - [round_note] +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ROUND(a[i+31:i], rounding) + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Swizzle - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Swizzle - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Swizzle - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Floating Point + Logical + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + AVX - Swizzle - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Integer - Flag + Logical + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. IF ((a[255:0] AND b[255:0]) == 0) @@ -2874,17 +2541,15 @@ ELSE FI RETURN ZF - -
immintrin.h
-
- - Integer - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. IF ((a[255:0] AND b[255:0]) == 0) @@ -2899,17 +2564,15 @@ ELSE FI RETURN CF - -
immintrin.h
-
- - Integer - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. IF ((a[255:0] AND b[255:0]) == 0) @@ -2928,17 +2591,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. tmp[255:0] := a[255:0] AND b[255:0] @@ -2955,17 +2616,15 @@ ELSE FI dst := ZF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. tmp[255:0] := a[255:0] AND b[255:0] @@ -2982,17 +2641,15 @@ ELSE FI dst := CF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. tmp[255:0] := a[255:0] AND b[255:0] @@ -3013,17 +2670,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. tmp[127:0] := a[127:0] AND b[127:0] @@ -3040,17 +2695,15 @@ ELSE FI dst := ZF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. tmp[127:0] := a[127:0] AND b[127:0] @@ -3067,17 +2720,15 @@ ELSE FI dst := CF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. tmp[127:0] := a[127:0] AND b[127:0] @@ -3098,17 +2749,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. tmp[255:0] := a[255:0] AND b[255:0] @@ -3127,17 +2776,15 @@ ELSE FI dst := ZF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. tmp[255:0] := a[255:0] AND b[255:0] @@ -3156,17 +2803,15 @@ ELSE FI dst := CF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. tmp[255:0] := a[255:0] AND b[255:0] @@ -3189,17 +2834,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. tmp[127:0] := a[127:0] AND b[127:0] @@ -3216,17 +2859,15 @@ ELSE FI dst := ZF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. tmp[127:0] := a[127:0] AND b[127:0] @@ -3243,17 +2884,15 @@ ELSE FI dst := CF - -
immintrin.h
-
- - Floating Point - Flag + AVX +
immintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. tmp[127:0] := a[127:0] AND b[127:0] @@ -3274,3164 +2913,2910 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Floating Point + AVX - Miscellaneous - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". +
immintrin.h
+ Logical +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". FOR j := 0 to 3 i := j*64 - IF a[i+63] - dst[j] := 1 + IF imm8[j] + dst[i+63:i] := b[i+63:i] ELSE - dst[j] := 0 + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Miscellaneous - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". FOR j := 0 to 7 i := j*32 - IF a[i+31] - dst[j] := 1 + IF imm8[j] + dst[i+31:i] := b[i+31:i] ELSE - dst[j] := 0 + dst[i+31:i] := a[i+31:i] FI ENDFOR -dst[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Set - - - Return vector of type __m256d with all elements set to zero. - -dst[MAX:0] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Set - - - Return vector of type __m256 with all elements set to zero. + Swizzle + + + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". -dst[MAX:0] := 0 +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX - Set - - - Return vector of type __m256i with all elements set to zero. - -dst[MAX:0] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Set - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + Swizzle + + + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Set - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Set - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. + Swizzle + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 -dst[71:64] := e8 -dst[79:72] := e9 -dst[87:80] := e10 -dst[95:88] := e11 -dst[103:96] := e12 -dst[111:104] := e13 -dst[119:112] := e14 -dst[127:120] := e15 -dst[135:128] := e16 -dst[143:136] := e17 -dst[151:144] := e18 -dst[159:152] := e19 -dst[167:160] := e20 -dst[175:168] := e21 -dst[183:176] := e22 -dst[191:184] := e23 -dst[199:192] := e24 -dst[207:200] := e25 -dst[215:208] := e26 -dst[223:216] := e27 -dst[231:224] := e28 -dst[239:232] := e29 -dst[247:240] := e30 -dst[255:248] := e31 +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Set - - - - - - - - - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 -dst[79:64] := e4 -dst[95:80] := e5 -dst[111:96] := e6 -dst[127:112] := e7 -dst[143:128] := e8 -dst[159:144] := e9 -dst[175:160] := e10 -dst[191:176] := e11 -dst[207:192] := e12 -dst[223:208] := e13 -dst[239:224] := e14 -dst[255:240] := e15 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +dst[255:224] := SELECT4(b[255:128], imm8[7:6]) dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Set - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[MAX:256] := 0 +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + AVX - Set - - - - - - Set packed 64-bit integers in "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 -dst[MAX:256] := 0 +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Set - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". -dst[63:0] := e3 -dst[127:64] := e2 -dst[191:128] := e1 -dst[255:192] := e0 -dst[MAX:256] := 0 +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Set - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. +
immintrin.h
+ Swizzle +
+ + + + + Extract a 32-bit integer from "a", selected with "index", and store the result in "dst". -dst[31:0] := e7 -dst[63:32] := e6 -dst[95:64] := e5 -dst[127:96] := e4 -dst[159:128] := e3 -dst[191:160] := e2 -dst[223:192] := e1 -dst[255:224] := e0 -dst[MAX:256] := 0 +dst[31:0] := (a[255:0] >> (index[2:0] * 32))[31:0] -
immintrin.h
-
- - Integer AVX - Set - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values in reverse order. +
immintrin.h
+ Swizzle +
+ + + + + Extract a 64-bit integer from "a", selected with "index", and store the result in "dst". -dst[7:0] := e31 -dst[15:8] := e30 -dst[23:16] := e29 -dst[31:24] := e28 -dst[39:32] := e27 -dst[47:40] := e26 -dst[55:48] := e25 -dst[63:56] := e24 -dst[71:64] := e23 -dst[79:72] := e22 -dst[87:80] := e21 -dst[95:88] := e20 -dst[103:96] := e19 -dst[111:104] := e18 -dst[119:112] := e17 -dst[127:120] := e16 -dst[135:128] := e15 -dst[143:136] := e14 -dst[151:144] := e13 -dst[159:152] := e12 -dst[167:160] := e11 -dst[175:168] := e10 -dst[183:176] := e9 -dst[191:184] := e8 -dst[199:192] := e7 -dst[207:200] := e6 -dst[215:208] := e5 -dst[223:216] := e4 -dst[231:224] := e3 -dst[239:232] := e2 -dst[247:240] := e1 -dst[255:248] := e0 -dst[MAX:256] := 0 +dst[63:0] := (a[255:0] >> (index[1:0] * 64))[63:0] -
immintrin.h
-
- - Integer AVX - Set - - - - - - - - - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values in reverse order. +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". -dst[15:0] := e15 -dst[31:16] := e14 -dst[47:32] := e13 -dst[63:48] := e12 -dst[79:64] := e11 -dst[95:80] := e10 -dst[111:96] := e9 -dst[127:112] := e8 -dst[143:128] := e7 -dst[159:144] := e6 -dst[175:160] := e5 -dst[191:176] := e4 -dst[207:192] := e3 -dst[223:208] := e2 -dst[239:224] := e1 -dst[255:240] := e0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[159:128] := SELECT4(a[255:128], b[129:128]) +dst[191:160] := SELECT4(a[255:128], b[161:160]) +dst[223:192] := SELECT4(a[255:128], b[193:192]) +dst[255:224] := SELECT4(a[255:128], b[225:224]) dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Set - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". -dst[31:0] := e7 -dst[63:32] := e6 -dst[95:64] := e5 -dst[127:96] := e4 -dst[159:128] := e3 -dst[191:160] := e2 -dst[223:192] := e1 -dst[255:224] := e0 -dst[MAX:256] := 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + AVX - Set - - - - - - Set packed 64-bit integers in "dst" with the supplied values in reverse order. +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -dst[63:0] := e3 -dst[127:64] := e2 -dst[191:128] := e1 -dst[255:192] := e0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Set - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:256] := 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Set - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +IF (b[129] == 0) dst[191:128] := a[191:128]; FI +IF (b[129] == 1) dst[191:128] := a[255:192]; FI +IF (b[193] == 0) dst[255:192] := a[191:128]; FI +IF (b[193] == 1) dst[255:192] := a[255:192]; FI dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Set - - - Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Set - - - Broadcast 16-bit integer "a" to all all elements of "dst". This intrinsic may generate the "vpbroadcastw". + Swizzle + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -dst[MAX:256] := 0 +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + AVX - Set - - - Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Set - - - Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". + Swizzle + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m256d to type __m256. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m256 to type __m256d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - Integer - AVX - Cast - - - Cast vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - Integer - AVX - Cast - - - Cast vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Integer - AVX - Cast - - - Cast vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Integer - AVX - Cast - - - Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m128 to type __m256; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX - Cast - - - Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Integer + AVX - Cast - - - Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point - AVX - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + Swizzle + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + Swizzle + + + + + + + Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX - General Support - - - Return vector of type __m256 with undefined elements. -
immintrin.h
-
- - Floating Point - AVX - General Support - - - Return vector of type __m256d with undefined elements. -
immintrin.h
-
- - Integer + AVX - General Support - - - Return vector of type __m256i with undefined elements.
immintrin.h
-
- - Floating Point - AVX - Set - - - - Set packed __m256 vector "dst" with the supplied values. + Swizzle + + + + + + + Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Set - - - - Set packed __m256d vector "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX - Set - - - - Set packed __m256i vector "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Set - - - - Set packed __m256 vector "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8". -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX - Set - - - - Set packed __m256d vector "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index". -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 +dst[255:0] := a[255:0] +sel := index[4:0]*8 +dst[sel+7:sel] := i[7:0] - -
immintrin.h
-
- - Integer AVX - Set - - - - Set packed __m256i vector "dst" with the supplied values. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index". -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 +dst[255:0] := a[255:0] +sel := index[3:0]*16 +dst[sel+15:sel] := i[15:0] - -
immintrin.h
-
- - Floating Point AVX - Load - - - - Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index". -dst[127:0] := MEM[loaddr+127:loaddr] -dst[255:128] := MEM[hiaddr+127:hiaddr] -dst[MAX:256] := 0 +dst[255:0] := a[255:0] +sel := index[2:0]*32 +dst[sel+31:sel] := i[31:0] -
immintrin.h
-
- - Floating Point AVX - Load - - - - Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index". -dst[127:0] := MEM[loaddr+127:loaddr] -dst[255:128] := MEM[hiaddr+127:hiaddr] -dst[MAX:256] := 0 +dst[255:0] := a[255:0] +sel := index[1:0]*64 +dst[sel+63:sel] := i[63:0] -
immintrin.h
-
- - Integer AVX - Load - - - - Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst". - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". -dst[127:0] := MEM[loaddr+127:loaddr] -dst[255:128] := MEM[hiaddr+127:hiaddr] +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Store - - - - - Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory two different 128-bit locations. - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -MEM[loaddr+127:loaddr] := a[127:0] -MEM[hiaddr+127:hiaddr] := a[255:128] -
immintrin.h
-
- - Floating Point - AVX - Store - - - - - Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory two different 128-bit locations. - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + Swizzle + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". -MEM[loaddr+127:loaddr] := a[127:0] -MEM[hiaddr+127:hiaddr] := a[255:128] +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Store - - - - - Store the high and low 128-bit halves (each composed of integer data) from "a" into memory two different 128-bit locations. - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -MEM[loaddr+127:loaddr] := a[127:0] -MEM[hiaddr+127:hiaddr] := a[255:128] -
immintrin.h
-
- - Floating Point - AVX - Trigonometry - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + Swizzle + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ACOS(a[i+63:i]) -ENDFOR +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ACOS(a[i+31:i]) -ENDFOR +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ACOSH(a[i+63:i]) + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ACOSH(a[i+31:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ASIN(a[i+63:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ASIN(a[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". + [round_note] FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ASINH(a[i+63:i]) + dst[i+63:i] := ROUND(a[i+63:i], rounding) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". + [round_note] FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ASINH(a[i+31:i]) + dst[i+31:i] := ROUND(a[i+31:i], rounding) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ATAN(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Trigonometry - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + Special Math Functions + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ATAN(a[i+31:i]) + dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Trigonometry - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + Special Math Functions + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) + dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ATANH(a[i+63:i]) + dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ATANH(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Elementary Math Functions - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 + Special Math Functions + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 3 i := j*64 - dst[i+63:i] := CubeRoot(a[i+63:i]) + dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := CubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Probability/Statistics - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Probability/Statistics - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := CDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Probability/Statistics - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 3 + Special Math Functions + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) + dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Probability/Statistics - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CEXP(a[31:0], b[31:0]) { - result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) - result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) - RETURN result -} + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 3 i := j*64 - dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) + dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CLOG(a[31:0], b[31:0]) { - result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) - result[63:32] := ATAN2(b, a) - RETURN result -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Trigonometry - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := COS(a[i+63:i]) + i := j*32 + dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 7 i := j*32 - dst[i+31:i] := COS(a[i+31:i]) + dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := COSD(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Trigonometry - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := COSD(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 + Compare + + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX
immintrin.h
-
- - Floating Point + Compare + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX - Trigonometry - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := COSH(a[i+63:i]) + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := COSH(a[i+31:i]) + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the square root of packed complex snumbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CSQRT(a[31:0], b[31:0]) { - sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) - result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - RETURN result -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + Convert + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 31 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 15 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 7 i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 3 i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 31 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". -FOR j := 0 to 15 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 7 i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Convert +
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". -FOR j := 0 to 3 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 +dst[31:0] := a[31:0] -
immintrin.h
-
- - Floating Point + AVX - Probability/Statistics - - - Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ERF(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Probability/Statistics - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ERF(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 + Convert + + + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". + +dst[63:0] := a[63:0] -
immintrin.h
-
- - Floating Point + AVX - Probability/Statistics - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Probability/Statistics - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+63:i] := 1.0 - ERF(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 + Convert + + + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] -
immintrin.h
-
- - Floating Point + AVX - Probability/Statistics - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + Convert + + + + + Zero the contents of all XMM or YMM registers. + YMM0[MAX:0] := 0 +YMM1[MAX:0] := 0 +YMM2[MAX:0] := 0 +YMM3[MAX:0] := 0 +YMM4[MAX:0] := 0 +YMM5[MAX:0] := 0 +YMM6[MAX:0] := 0 +YMM7[MAX:0] := 0 +IF _64_BIT_MODE + YMM8[MAX:0] := 0 + YMM9[MAX:0] := 0 + YMM10[MAX:0] := 0 + YMM11[MAX:0] := 0 + YMM12[MAX:0] := 0 + YMM13[MAX:0] := 0 + YMM14[MAX:0] := 0 + YMM15[MAX:0] := 0 +FI + + AVX - Probability/Statistics - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) -ENDFOR -dst[MAX:256] := 0 +
immintrin.h
+ General Support +
+ + + + Zero the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified. + YMM0[MAX:128] := 0 +YMM1[MAX:128] := 0 +YMM2[MAX:128] := 0 +YMM3[MAX:128] := 0 +YMM4[MAX:128] := 0 +YMM5[MAX:128] := 0 +YMM6[MAX:128] := 0 +YMM7[MAX:128] := 0 +IF _64_BIT_MODE + YMM8[MAX:128] := 0 + YMM9[MAX:128] := 0 + YMM10[MAX:128] := 0 + YMM11[MAX:128] := 0 + YMM12[MAX:128] := 0 + YMM13[MAX:128] := 0 + YMM14[MAX:128] := 0 + YMM15[MAX:128] := 0 +FI + + AVX
immintrin.h
-
- - Floating Point + General Support + + + + + Return vector of type __m256 with undefined elements. AVX - Probability/Statistics - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + General Support + + + + + Return vector of type __m256d with undefined elements. AVX - Probability/Statistics - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+63:i] := 1.0 / ERF(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + General Support + + + + + Return vector of type __m256i with undefined elements. AVX - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ General Support +
+ + + + Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) +tmp[31:0] := MEM[mem_addr+31:mem_addr] +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := tmp[31:0] ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". -FOR j := 0 to 7 +tmp[31:0] := MEM[mem_addr+31:mem_addr] +FOR j := 0 to 3 i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) + dst[i+31:i] := tmp[31:0] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst". +tmp[63:0] := MEM[mem_addr+63:mem_addr] FOR j := 0 to 3 i := j*64 - dst[i+63:i] := POW(10.0, a[i+63:i]) + dst[i+63:i] := tmp[63:0] ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) -ENDFOR +tmp[127:0] := MEM[mem_addr+127:mem_addr] +dst[127:0] := tmp[127:0] +dst[255:128] := tmp[127:0] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) -ENDFOR +tmp[127:0] := MEM[mem_addr+127:mem_addr] +dst[127:0] := tmp[127:0] +dst[255:128] := tmp[127:0] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - - Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR + Load + + + + + Load 256-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 3 i := j*64 - dst[i+63:i] := InvCubeRoot(a[i+63:i]) + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := InvCubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Elementary Math Functions - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 + Load + + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := InvSQRT(a[i+63:i]) + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 7 i := j*32 - dst[i+31:i] := InvSQRT(a[i+31:i]) + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Elementary Math Functions - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + Load + + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) -ENDFOR +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Elementary Math Functions - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) -ENDFOR +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Elementary Math Functions - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(1.0 + a[i+63:i]) -ENDFOR +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Elementary Math Functions - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(1.0 + a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX
immintrin.h
-
- - Floating Point + Store + + + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX - Elementary Math Functions - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) -ENDFOR -dst[MAX:256] := 0 +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX
immintrin.h
-
- - Floating Point + Store + + + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX - Elementary Math Functions - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Store +
+ + + + + Store 256-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) -ENDFOR -dst[MAX:256] := 0 +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX
immintrin.h
-
- - Floating Point + Store + + + + + + Store 256-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX - Elementary Math Functions - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Elementary Math Functions - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + Store + + + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Elementary Math Functions - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 31 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 15 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) + Store + + + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 3 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 + Store + + + + + + Store 256-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 31 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 15 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 + Store + + + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] -
immintrin.h
-
- - Integer + AVX - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX
immintrin.h
-
- - Integer + Store + + + + + + + Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory two different 128-bit locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + AVX - Arithmetic - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 3 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 +
immintrin.h
+ Store +
+ + + + + + Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory two different 128-bit locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + AVX
immintrin.h
-
- - Floating Point + Store + + + + + + + Store the high and low 128-bit halves (each composed of integer data) from "a" into memory two different 128-bit locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + AVX - Trigonometry - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Store +
+ + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) -ENDFOR +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] +dst[159:128] := a[191:160] +dst[191:160] := a[191:160] +dst[223:192] := a[255:224] +dst[255:224] := a[255:224] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) -ENDFOR +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] +dst[159:128] := a[159:128] +dst[191:160] := a[159:128] +dst[223:192] := a[223:192] +dst[255:224] := a[223:192] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) -ENDFOR +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] +dst[191:128] := a[191:128] +dst[255:192] := a[191:128] dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". +
immintrin.h
+ Move +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. FOR j := 0 to 7 i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) + dst[i+31:i] := 1.0 / a[i+31:i] ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SIND(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point - AVX - Trigonometry - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 + Elementary Math Functions + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 7 i := j*32 - dst[i+31:i] := SIND(a[i+31:i]) + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := SINH(a[i+63:i]) + dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Trigonometry - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := SINH(a[i+31:i]) + dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point + AVX - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. +
immintrin.h
+ Elementary Math Functions +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) + IF a[i+63] + dst[j] := 1 + ELSE + dst[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:4] := 0 -
immintrin.h
-
- - Floating Point + AVX - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) + IF a[i+31] + dst[j] := 1 + ELSE + dst[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:8] := 0 + + AVX
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + Return vector of type __m256d with all elements set to zero. + +dst[MAX:0] := 0 + + AVX - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. +
immintrin.h
+ Set +
+ + + + Return vector of type __m256 with all elements set to zero. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 +dst[MAX:0] := 0 + + AVX
immintrin.h
-
- - Floating Point + Set + + + + + Return vector of type __m256i with all elements set to zero. + +dst[MAX:0] := 0 + + AVX - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i]) -ENDFOR +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ROUND(a[i+31:i]) -ENDFOR +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 +dst[135:128] := e16 +dst[143:136] := e17 +dst[151:144] := e18 +dst[159:152] := e19 +dst[167:160] := e20 +dst[175:168] := e21 +dst[183:176] := e22 +dst[191:184] := e23 +dst[199:192] := e24 +dst[207:200] := e25 +dst[215:208] := e26 +dst[223:216] := e27 +dst[231:224] := e28 +dst[239:232] := e29 +dst[247:240] := e30 +dst[255:248] := e31 dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Elementary Math Functions - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 +dst[143:128] := e8 +dst[159:144] := e9 +dst[175:160] := e10 +dst[191:176] := e11 +dst[207:192] := e12 +dst[223:208] := e13 +dst[239:224] := e14 +dst[255:240] := e15 dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Elementary Math Functions - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Trigonometry - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the supplied values. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := TAN(a[i+63:i]) -ENDFOR +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Trigonometry - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := TAN(a[i+31:i]) -ENDFOR +dst[63:0] := e3 +dst[127:64] := e2 +dst[191:128] := e1 +dst[255:192] := e0 dst[MAX:256] := 0 + AVX
immintrin.h
-
- - Floating Point + Set + + + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[31:0] := e7 +dst[63:32] := e6 +dst[95:64] := e5 +dst[127:96] := e4 +dst[159:128] := e3 +dst[191:160] := e2 +dst[223:192] := e1 +dst[255:224] := e0 +dst[MAX:256] := 0 + AVX - Trigonometry - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := TAND(a[i+63:i]) -ENDFOR +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e31 +dst[15:8] := e30 +dst[23:16] := e29 +dst[31:24] := e28 +dst[39:32] := e27 +dst[47:40] := e26 +dst[55:48] := e25 +dst[63:56] := e24 +dst[71:64] := e23 +dst[79:72] := e22 +dst[87:80] := e21 +dst[95:88] := e20 +dst[103:96] := e19 +dst[111:104] := e18 +dst[119:112] := e17 +dst[127:120] := e16 +dst[135:128] := e15 +dst[143:136] := e14 +dst[151:144] := e13 +dst[159:152] := e12 +dst[167:160] := e11 +dst[175:168] := e10 +dst[183:176] := e9 +dst[191:184] := e8 +dst[199:192] := e7 +dst[207:200] := e6 +dst[215:208] := e5 +dst[223:216] := e4 +dst[231:224] := e3 +dst[239:232] := e2 +dst[247:240] := e1 +dst[255:248] := e0 dst[MAX:256] := 0 + AVX
immintrin.h
-
- - Floating Point + Set + + + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e15 +dst[31:16] := e14 +dst[47:32] := e13 +dst[63:48] := e12 +dst[79:64] := e11 +dst[95:80] := e10 +dst[111:96] := e9 +dst[127:112] := e8 +dst[143:128] := e7 +dst[159:144] := e6 +dst[175:160] := e5 +dst[191:176] := e4 +dst[207:192] := e3 +dst[223:208] := e2 +dst[239:224] := e1 +dst[255:240] := e0 +dst[MAX:256] := 0 + AVX - Trigonometry - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := TAND(a[i+31:i]) -ENDFOR +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e7 +dst[63:32] := e6 +dst[95:64] := e5 +dst[127:96] := e4 +dst[159:128] := e3 +dst[191:160] := e2 +dst[223:192] := e1 +dst[255:224] := e0 dst[MAX:256] := 0 + AVX
immintrin.h
-
- - Floating Point + Set + + + + + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e3 +dst[127:64] := e2 +dst[191:128] := e1 +dst[255:192] := e0 +dst[MAX:256] := 0 + AVX - Trigonometry - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := TANH(a[i+63:i]) + dst[i+63:i] := a[63:0] ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Trigonometry - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := TANH(a[i+31:i]) + dst[i+31:i] := a[31:0] ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Miscellaneous - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := TRUNCATE(a[i+63:i]) +
immintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[7:0] ENDFOR dst[MAX:256] := 0 -
immintrin.h
-
- - Floating Point AVX - Miscellaneous - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := TRUNCATE(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX - Convert - - - Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". - -dst[31:0] := a[31:0] - - -
immintrin.h
-
- - Floating Point - AVX - Convert - - - Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". - -dst[63:0] := a[63:0] - - -
immintrin.h
-
- - Integer - AVX - Convert - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Extract an 8-bit integer from "a", selected with "index", and store the result in "dst". - -dst[7:0] := (a[255:0] >> (index[4:0] * 8))[7:0] - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Extract a 16-bit integer from "a", selected with "index", and store the result in "dst". - -dst[15:0] := (a[255:0] >> (index[3:0] * 16))[15:0] - -
immintrin.h
-
- - Integer - AVX2 - Special Math Functions - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := ABS(a[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Special Math Functions - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + Set + + + + + Broadcast 16-bit integer "a" to all all elements of "dst". This intrinsic may generate the "vpbroadcastw". FOR j := 0 to 15 i := j*16 - dst[i+15:i] := ABS(a[i+15:i]) + dst[i+15:i] := a[15:0] ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer - AVX2 - Special Math Functions - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + Set + + + + + Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ABS(a[i+31:i]) + dst[i+31:i] := a[31:0] ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". + Set + + + + + Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] ENDFOR dst[MAX:256] := 0 - + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". + Set + + + + + + Set packed __m256 vector "dst" with the supplied values. -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". + Set + + + + + + Set packed __m256d vector "dst" with the supplied values. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst". + Set + + + + + + Set packed __m256i vector "dst" with the supplied values. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + Set + + + + + + Set packed __m256 vector "dst" with the supplied values. -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + Set + + + + + + Set packed __m256d vector "dst" with the supplied values. -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + Set + + + + + + Set packed __m256i vector "dst" with the supplied values. -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) -ENDFOR +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] dst[MAX:256] := 0 - + + AVX
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR -dst[MAX:256] := 0 - - + Set + + + + + Cast vector of type __m256d to type __m256. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX
immintrin.h
-
- - Integer - AVX2 - Miscellaneous - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". - -FOR j := 0 to 1 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - dst[i+127:i] := tmp[127:0] -ENDFOR -dst[MAX:256] := 0 - - + Cast + + + + + Cast vector of type __m256 to type __m256d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX
immintrin.h
-
- - Integer - AVX2 - Logical - - - - Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[255:0] := (a[255:0] AND b[255:0]) -dst[MAX:256] := 0 - - + Cast + + + + + Cast vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX
immintrin.h
-
- - Integer - AVX2 - Logical - - - - Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[255:0] := ((NOT a[255:0]) AND b[255:0]) -dst[MAX:256] := 0 - - + Cast + + + + + Cast vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX
immintrin.h
-
- - Integer - AVX2 - Probability/Statistics - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + Cast + + + + + Cast vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m256; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + + + + Extract an 8-bit integer from "a", selected with "index", and store the result in "dst". -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR -dst[MAX:256] := 0 +dst[7:0] := (a[255:0] >> (index[4:0] * 8))[7:0] - -
immintrin.h
-
- - Integer AVX2 - Probability/Statistics - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "index", and store the result in "dst". -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -ENDFOR -dst[MAX:256] := 0 +dst[15:0] := (a[255:0] >> (index[3:0] * 16))[15:0] - -
immintrin.h
-
- - Integer AVX2 +
immintrin.h
Swizzle - - - - +
+ + + + + Blend packed 16-bit integers from "a" and "b" within 128-bit lanes using control mask "imm8", and store the results in "dst". FOR j := 0 to 15 @@ -6444,17 +5829,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - - - +
+ + + + + Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". FOR j := 0 to 3 @@ -6467,17 +5851,16 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - - - +
+ + + + + Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". FOR j := 0 to 7 @@ -6490,17 +5873,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - - - +
+ + + + + Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". FOR j := 0 to 31 @@ -6513,15 +5895,14 @@ FOR j := 0 to 31 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". FOR j := 0 to 15 @@ -6530,15 +5911,14 @@ FOR j := 0 to 15 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". FOR j := 0 to 31 @@ -6547,15 +5927,14 @@ FOR j := 0 to 31 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". FOR j := 0 to 3 @@ -6564,15 +5943,14 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". FOR j := 0 to 7 @@ -6581,15 +5959,14 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". FOR j := 0 to 1 @@ -6598,15 +5975,14 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". FOR j := 0 to 3 @@ -6615,15 +5991,14 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". FOR j := 0 to 1 @@ -6632,15 +6007,14 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". FOR j := 0 to 3 @@ -6649,45 +6023,42 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". dst[127:0] := a[127:0] dst[255:128] := a[127:0] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". dst[127:0] := a[127:0] dst[255:128] := a[127:0] dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". FOR j := 0 to 3 @@ -6696,15 +6067,14 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". FOR j := 0 to 7 @@ -6713,15 +6083,14 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". FOR j := 0 to 7 @@ -6730,15 +6099,14 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Swizzle - - +
+ + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". FOR j := 0 to 15 @@ -6747,394 +6115,863 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Compare - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Compare - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + Swizzle + + + + + + Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR -dst[MAX:256] := 0 +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Compare - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Compare - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 -ENDFOR +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Convert - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + Swizzle + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". -FOR j:= 0 to 7 - i := 32*j - k := 16*j - dst[i+31:i] := SignExtend32(a[k+15:k]) +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Convert - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -FOR j:= 0 to 3 - i := 64*j - k := 16*j - dst[i+63:i] := SignExtend64(a[k+15:k]) -ENDFOR +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Convert - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". -FOR j:= 0 to 3 - i := 64*j - k := 32*j - dst[i+63:i] := SignExtend64(a[k+31:k]) +FOR j := 0 to 15 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI + IF b[128+i+7] == 1 + dst[128+i+7:128+i] := 0 + ELSE + index[3:0] := b[128+i+3:128+i] + dst[128+i+7:128+i] := a[128+index*8+7:128+index*8] + FI ENDFOR dst[MAX:256] := 0 - + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst". + +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +dst[191:128] := a[191:128] +dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +dst[MAX:256] := 0 + + + AVX2
immintrin.h
-
- - Integer + Swizzle + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst". + +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] +dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +dst[255:192] := a[255:192] +dst[MAX:256] := 0 + + AVX2 - Convert - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". -FOR j := 0 to 15 +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 31 i := j*8 - l := j*16 - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+7:i] := ABS(a[i+7:i]) ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Special Math Functions + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Convert - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[i+31:i] := SignExtend32(a[k+7:k]) + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Convert - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". -FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[i+63:i] := SignExtend64(a[k+7:k]) +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Special Math Functions + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Convert - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[i+31:i] := ZeroExtend32(a[k+15:k]) + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Convert - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". -FOR j:= 0 to 3 - i := 64*j - k := 16*j - dst[i+63:i] := ZeroExtend64(a[k+15:k]) +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Special Math Functions + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Convert - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". -FOR j:= 0 to 3 - i := 64*j - k := 32*j - dst[i+63:i] := ZeroExtend64(a[k+31:k]) +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Special Math Functions + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Convert - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 i := j*8 - l := j*16 - dst[l+15:l] := ZeroExtend16(a[i+7:i]) + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Special Math Functions + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Convert - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[i+31:i] := ZeroExtend32(a[k+7:k]) + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Special Math Functions + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Convert - - - Zero extend packed unsigned 8-bit integers in the low 8 byte sof "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[i+63:i] := ZeroExtend64(a[k+7:k]) + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Arithmetic + + + + + + Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Swizzle - - - - Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Arithmetic + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:256] := 0 + + AVX2 +
immintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". dst[15:0] := a[31:16] + a[15:0] @@ -7155,16 +6992,15 @@ dst[239:224] := b[223:208] + b[207:192] dst[255:240] := b[255:240] + b[239:224] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". dst[31:0] := a[63:32] + a[31:0] @@ -7177,16 +7013,15 @@ dst[223:192] := b[191:160] + b[159:128] dst[255:224] := b[255:224] + b[223:192] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". dst[15:0] := Saturate16(a[31:16] + a[15:0]) @@ -7207,16 +7042,15 @@ dst[239:224] := Saturate16(b[223:208] + b[207:192]) dst[255:240] := Saturate16(b[255:240] + b[239:224]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". dst[15:0] := a[15:0] - a[31:16] @@ -7237,16 +7071,15 @@ dst[239:224] := b[207:192] - b[223:208] dst[255:240] := b[239:224] - b[255:240] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". dst[31:0] := a[31:0] - a[63:32] @@ -7259,16 +7092,15 @@ dst[223:192] := b[159:128] - b[191:160] dst[255:224] := b[223:192] - b[255:224] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". dst[15:0] := Saturate16(a[15:0] - a[31:16]) @@ -7289,2059 +7121,2111 @@ dst[239:224] := Saturate16(b[207:192] - b[223:208]) dst[255:240] := Saturate16(b[239:224] - b[255:240]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". -FOR j := 0 to 3 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Load - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". -FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Load - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". -FOR j := 0 to 3 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX2 - Load - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". -FOR j := 0 to 3 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - AVX2 - Load - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". -FOR j := 0 to 3 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Load - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + Multiply the packed signed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Load - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". +FOR j := 0 to 31 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR FOR j := 0 to 3 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Swizzle - - - - - Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 Arithmetic - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + + + + + + Negate packed signed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +FOR j := 0 to 31 + i := j*8 + IF b[i+7:i] < 0 + dst[i+7:i] := -(a[i+7:i]) + ELSE IF b[i+7:i] == 0 + dst[i+7:i] := 0 + ELSE + dst[i+7:i] := a[i+7:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Arithmetic - - - - Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". +
+ + + + + Negate packed signed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. FOR j := 0 to 15 i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX2 - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX2 - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + IF b[i+15:i] < 0 + dst[i+15:i] := -(a[i+15:i]) + ELSE IF b[i+15:i] == 0 + dst[i+15:i] := 0 ELSE - dst[i+63:i] := src[i+63:i] + dst[i+15:i] := a[i+15:i] FI ENDFOR -mask[MAX:256] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Negate packed signed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + IF b[i+31:i] < 0 + dst[i+31:i] := -(a[i+31:i]) + ELSE IF b[i+31:i] == 0 + dst[i+31:i] := 0 ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] ENDFOR -mask[MAX:256] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". FOR j := 0 to 7 i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR -mask[MAX:256] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Load - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". FOR j := 0 to 3 i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR -mask[MAX:256] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ENDFOR -mask[MAX:256] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) ENDFOR -mask[MAX:64] := 0 -dst[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Arithmetic +
+ + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". FOR j := 0 to 1 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + dst[i+127:i] := tmp[127:0] ENDFOR -mask[MAX:64] := 0 -dst[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 31 + i := j*8 + dst[j] := a[i+7] ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Miscellaneous +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at on the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 +DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { + a_offset := imm8[2]*32 + b_offset := imm8[1:0]*32 + FOR j := 0 to 7 + i := j*8 + k := a_offset+i + l := b_offset + tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ + ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) + ENDFOR + RETURN tmp[127:0] +} +dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) +dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:256] := 0 +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) +dst[135:128] := Saturate8(a[143:128]) +dst[143:136] := Saturate8(a[159:144]) +dst[151:144] := Saturate8(a[175:160]) +dst[159:152] := Saturate8(a[191:176]) +dst[167:160] := Saturate8(a[207:192]) +dst[175:168] := Saturate8(a[223:208]) +dst[183:176] := Saturate8(a[239:224]) +dst[191:184] := Saturate8(a[255:240]) +dst[199:192] := Saturate8(b[143:128]) +dst[207:200] := Saturate8(b[159:144]) +dst[215:208] := Saturate8(b[175:160]) +dst[223:216] := Saturate8(b[191:176]) +dst[231:224] := Saturate8(b[207:192]) +dst[239:232] := Saturate8(b[223:208]) +dst[247:240] := Saturate8(b[239:224]) +dst[255:248] := Saturate8(b[255:240]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). - -FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Load - - - - Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) +dst[143:128] := Saturate16(a[159:128]) +dst[159:144] := Saturate16(a[191:160]) +dst[175:160] := Saturate16(a[223:192]) +dst[191:176] := Saturate16(a[255:224]) +dst[207:192] := Saturate16(b[159:128]) +dst[223:208] := Saturate16(b[191:160]) +dst[239:224] := Saturate16(b[223:192]) +dst[255:240] := Saturate16(b[255:224]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) +dst[135:128] := SaturateU8(a[143:128]) +dst[143:136] := SaturateU8(a[159:144]) +dst[151:144] := SaturateU8(a[175:160]) +dst[159:152] := SaturateU8(a[191:176]) +dst[167:160] := SaturateU8(a[207:192]) +dst[175:168] := SaturateU8(a[223:208]) +dst[183:176] := SaturateU8(a[239:224]) +dst[191:184] := SaturateU8(a[255:240]) +dst[199:192] := SaturateU8(b[143:128]) +dst[207:200] := SaturateU8(b[159:144]) +dst[215:208] := SaturateU8(b[175:160]) +dst[223:216] := SaturateU8(b[191:176]) +dst[231:224] := SaturateU8(b[207:192]) +dst[239:232] := SaturateU8(b[223:208]) +dst[247:240] := SaturateU8(b[239:224]) +dst[255:248] := SaturateU8(b[255:240]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - - Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) +dst[143:128] := SaturateU16(a[159:128]) +dst[159:144] := SaturateU16(a[191:160]) +dst[175:160] := SaturateU16(a[223:192]) +dst[191:176] := SaturateU16(a[255:224]) +dst[207:192] := SaturateU16(b[159:128]) +dst[223:208] := SaturateU16(b[191:160]) +dst[239:224] := SaturateU16(b[223:192]) +dst[255:240] := SaturateU16(b[255:224]) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Store - - - - - Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). +
immintrin.h
+ Miscellaneous +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". -FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR +dst[255:0] := (a[255:0] AND b[255:0]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Store - - - - - Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". -FOR j := 0 to 7 - i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR +dst[255:0] := ((NOT a[255:0]) AND b[255:0]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Store - - - - - Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR +dst[255:0] := (a[255:0] OR b[255:0]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Store - - - - - Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". -FOR j := 0 to 3 - i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR +dst[255:0] := (a[255:0] XOR b[255:0]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Logical +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". FOR j := 0 to 31 i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". FOR j := 0 to 15 i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + Probability/Statistics + + + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 31 i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 15 i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Compare + + + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + AVX2 - Special Math Functions - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". FOR j := 0 to 31 i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". FOR j := 0 to 15 i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". FOR j := 0 to 7 i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Compare +
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +FOR j:= 0 to 7 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Special Math Functions - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +FOR j:= 0 to 3 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Miscellaneous - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". +
immintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". -FOR j := 0 to 31 - i := j*8 - dst[j] := a[i+7] +FOR j:= 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Miscellaneous - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at on the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". -DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { - a_offset := imm8[2]*32 - b_offset := imm8[1:0]*32 - FOR j := 0 to 7 - i := j*8 - k := a_offset+i - l := b_offset - tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ - ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) - ENDFOR - RETURN tmp[127:0] -} -dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) -dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3]) +FOR j := 0 to 15 + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+31:i] * b[i+31:i] + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] +FOR j:= 0 to 3 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] +FOR j:= 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". FOR j := 0 to 15 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Multiply the packed signed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 7 - i := j*32 - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Logical - - - - Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 8 byte sof "a" to packed 64-bit integers, and store the results in "dst". -dst[255:0] := (a[255:0] OR b[255:0]) +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR dst[MAX:256] := 0 - + + AVX2
immintrin.h
-
- - Integer + Convert + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + AVX2 - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(a[79:64]) -dst[47:40] := Saturate8(a[95:80]) -dst[55:48] := Saturate8(a[111:96]) -dst[63:56] := Saturate8(a[127:112]) -dst[71:64] := Saturate8(b[15:0]) -dst[79:72] := Saturate8(b[31:16]) -dst[87:80] := Saturate8(b[47:32]) -dst[95:88] := Saturate8(b[63:48]) -dst[103:96] := Saturate8(b[79:64]) -dst[111:104] := Saturate8(b[95:80]) -dst[119:112] := Saturate8(b[111:96]) -dst[127:120] := Saturate8(b[127:112]) -dst[135:128] := Saturate8(a[143:128]) -dst[143:136] := Saturate8(a[159:144]) -dst[151:144] := Saturate8(a[175:160]) -dst[159:152] := Saturate8(a[191:176]) -dst[167:160] := Saturate8(a[207:192]) -dst[175:168] := Saturate8(a[223:208]) -dst[183:176] := Saturate8(a[239:224]) -dst[191:184] := Saturate8(a[255:240]) -dst[199:192] := Saturate8(b[143:128]) -dst[207:200] := Saturate8(b[159:144]) -dst[215:208] := Saturate8(b[175:160]) -dst[223:216] := Saturate8(b[191:176]) -dst[231:224] := Saturate8(b[207:192]) -dst[239:232] := Saturate8(b[223:208]) -dst[247:240] := Saturate8(b[239:224]) -dst[255:248] := Saturate8(b[255:240]) +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(a[95:64]) -dst[63:48] := Saturate16(a[127:96]) -dst[79:64] := Saturate16(b[31:0]) -dst[95:80] := Saturate16(b[63:32]) -dst[111:96] := Saturate16(b[95:64]) -dst[127:112] := Saturate16(b[127:96]) -dst[143:128] := Saturate16(a[159:128]) -dst[159:144] := Saturate16(a[191:160]) -dst[175:160] := Saturate16(a[223:192]) -dst[191:176] := Saturate16(a[255:224]) -dst[207:192] := Saturate16(b[159:128]) -dst[223:208] := Saturate16(b[191:160]) -dst[239:224] := Saturate16(b[223:192]) -dst[255:240] := Saturate16(b[255:224]) -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + Load + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(a[79:64]) -dst[47:40] := SaturateU8(a[95:80]) -dst[55:48] := SaturateU8(a[111:96]) -dst[63:56] := SaturateU8(a[127:112]) -dst[71:64] := SaturateU8(b[15:0]) -dst[79:72] := SaturateU8(b[31:16]) -dst[87:80] := SaturateU8(b[47:32]) -dst[95:88] := SaturateU8(b[63:48]) -dst[103:96] := SaturateU8(b[79:64]) -dst[111:104] := SaturateU8(b[95:80]) -dst[119:112] := SaturateU8(b[111:96]) -dst[127:120] := SaturateU8(b[127:112]) -dst[135:128] := SaturateU8(a[143:128]) -dst[143:136] := SaturateU8(a[159:144]) -dst[151:144] := SaturateU8(a[175:160]) -dst[159:152] := SaturateU8(a[191:176]) -dst[167:160] := SaturateU8(a[207:192]) -dst[175:168] := SaturateU8(a[223:208]) -dst[183:176] := SaturateU8(a[239:224]) -dst[191:184] := SaturateU8(a[255:240]) -dst[199:192] := SaturateU8(b[143:128]) -dst[207:200] := SaturateU8(b[159:144]) -dst[215:208] := SaturateU8(b[175:160]) -dst[223:216] := SaturateU8(b[191:176]) -dst[231:224] := SaturateU8(b[207:192]) -dst[239:232] := SaturateU8(b[223:208]) -dst[247:240] := SaturateU8(b[239:224]) -dst[255:248] := SaturateU8(b[255:240]) -dst[MAX:256] := 0 +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". - -dst[15:0] := SaturateU16(a[31:0]) -dst[31:16] := SaturateU16(a[63:32]) -dst[47:32] := SaturateU16(a[95:64]) -dst[63:48] := SaturateU16(a[127:96]) -dst[79:64] := SaturateU16(b[31:0]) -dst[95:80] := SaturateU16(b[63:32]) -dst[111:96] := SaturateU16(b[95:64]) -dst[127:112] := SaturateU16(b[127:96]) -dst[143:128] := SaturateU16(a[159:128]) -dst[159:144] := SaturateU16(a[191:160]) -dst[175:160] := SaturateU16(a[223:192]) -dst[191:176] := SaturateU16(a[255:224]) -dst[207:192] := SaturateU16(b[159:128]) -dst[223:208] := SaturateU16(b[191:160]) -dst[239:224] := SaturateU16(b[223:192]) -dst[255:240] := SaturateU16(b[255:224]) -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - - Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". + Load + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Swizzle - - - - Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX2 - Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". + Load + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Swizzle - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX2 - Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 31 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR FOR j := 0 to 3 i := j*64 - dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ - tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] - dst[i+63:i+16] := 0 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Swizzle - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + Load + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*8 - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[3:0] := b[i+3:i] - dst[i+7:i] := a[index*8+7:index*8] - FI - IF b[128+i+7] == 1 - dst[128+i+7:128+i] := 0 - ELSE - index[3:0] := b[128+i+3:128+i] - dst[128+i+7:128+i] := a[128+index*8+7:128+index*8] - FI +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Swizzle - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst". +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -dst[63:0] := a[63:0] -dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -dst[191:128] := a[191:128] -dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Swizzle - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst". +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -dst[127:64] := a[127:64] -dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -dst[255:192] := a[255:192] -dst[MAX:256] := 0 +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Negate packed signed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 31 - i := j*8 - IF b[i+7:i] < 0 - dst[i+7:i] := -(a[i+7:i]) - ELSE IF b[i+7:i] == 0 - dst[i+7:i] := 0 - ELSE - dst[i+7:i] := a[i+7:i] - FI +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Negate packed signed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*16 - IF b[i+15:i] < 0 - dst[i+15:i] := -(a[i+15:i]) - ELSE IF b[i+15:i] == 0 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := a[i+15:i] - FI +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Arithmetic - - - - Negate packed signed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 - IF b[i+31:i] < 0 - dst[i+31:i] := -(a[i+31:i]) - ELSE IF b[i+31:i] == 0 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := a[i+31:i] - FI + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Shift - - - - Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) -dst[255:128] := a[255:128] << (tmp*8) -dst[MAX:256] := 0 +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Shift - - - - Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) -dst[255:128] := a[255:128] << (tmp*8) +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Shift - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +mask[MAX:128] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Shift - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + dst[i+63:i] := src[i+63:i] FI ENDFOR +mask[MAX:256] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Shift - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +mask[MAX:128] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Shift - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + dst[i+31:i] := src[i+31:i] FI ENDFOR +mask[MAX:256] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Shift - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:64] := 0 +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:64] := 0 +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) FI ENDFOR dst[MAX:256] := 0 - + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2
immintrin.h
-
- - Integer + Shift + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 @@ -9354,16 +9238,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 @@ -9376,16 +9259,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 7 @@ -9398,16 +9280,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 1 @@ -9420,16 +9301,15 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 @@ -9442,16 +9322,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 15 @@ -9464,16 +9343,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 15 @@ -9486,16 +9364,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 7 @@ -9508,16 +9385,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 7 @@ -9530,16 +9406,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 3 @@ -9552,16 +9427,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 7 @@ -9574,16 +9448,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". tmp := imm8[7:0] @@ -9594,16 +9467,15 @@ dst[127:0] := a[127:0] >> (tmp*8) dst[255:128] := a[255:128] >> (tmp*8) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". tmp := imm8[7:0] @@ -9614,16 +9486,15 @@ dst[127:0] := a[127:0] >> (tmp*8) dst[255:128] := a[255:128] >> (tmp*8) dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 15 @@ -9636,16 +9507,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". FOR j := 0 to 15 @@ -9658,16 +9528,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 7 @@ -9680,16 +9549,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". FOR j := 0 to 7 @@ -9702,16 +9570,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 @@ -9724,16 +9591,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 @@ -9746,16 +9612,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 @@ -9768,16 +9633,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 7 @@ -9790,16 +9654,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 1 @@ -9812,16 +9675,15 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX2 +
immintrin.h
Shift - - - +
+ + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 @@ -9834,475 +9696,56 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX2 - Load - - - Load 256-bits of integer data from memory into "dst" using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + Shift + + + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] +FOR i := 0 to 1 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 3 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Arithmetic - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Logical - - - - Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[255:0] := (a[255:0] XOR b[255:0]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX2 - Swizzle - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Mask - AVX512BW - Miscellaneous - - - - Unpack and interleave 32 bits from masks "a" and "b", and store the 64-bit result in "dst". - -dst[31:0] := b[31:0] -dst[63:32] := a[31:0] -dst[MAX:64] := 0 - - -
immintrin.h
-
- - Mask + AVX512BW - Miscellaneous - - - - Unpack and interleave 16 bits from masks "a" and "b", and store the 32-bit result in "dst". - -dst[15:0] := b[15:0] -dst[31:16] := a[15:0] -dst[MAX:32] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Miscellaneous - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 1 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 3 - i := j*64 - dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW Miscellaneous - - - - - - + + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -10336,19 +9779,18 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Miscellaneous - - - - - +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -10382,145 +9824,17 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512BW - Miscellaneous - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 3 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 7 - i := j*64 - dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512BW - Miscellaneous - - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 3 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 7 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 3 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 7 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW +
immintrin.h
Miscellaneous - - - - +
+ + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -10544,20 +9858,19 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Miscellaneous - - - - - - +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -10589,19 +9902,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Miscellaneous - - - - - +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -10633,1048 +9945,1024 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Load - - - - - Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 31 + i := j*8 IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - - Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 31 + i := j*8 IF k[j] - dst[i+15:i] := a[i+15:i] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Store - - - - - Store packed 16-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) FOR j := 0 to 15 - i := j*16 + i := j*8 IF k[j] - MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Load - - - - Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) FOR j := 0 to 15 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k[j] - dst[i+15:i] := a[i+15:i] + dst[i+7:i] := b[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Load - - - - - Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + dst[i+7:i] := b[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := a[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Move - - - - - Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[i+15:i] + dst[i+15:i] := b[i+15:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := a[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Store - - - - - Store packed 16-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 IF k[j] - MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Load - - - - Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + dst[i+7:i] := a[7:0] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Move - - - - Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := a[i+15:i] + dst[i+7:i] := a[7:0] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Load - - - - - Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + dst[i+7:i] := a[7:0] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - - Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k[j] - dst[i+15:i] := a[i+15:i] + dst[i+7:i] := a[7:0] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Store - - - - - Store packed 16-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k[j] - MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Load - - - - Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := a[i+15:i] + dst[i+15:i] := a[15:0] ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Load - - - - - Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + dst[i+15:i] := a[15:0] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - - Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := a[i+7:i] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := idx[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Store - - - - - Store packed 8-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 31 - i := j*8 - IF k[j] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Load - - - - Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := a[i+7:i] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW - Load - - - - - Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] - ELSE - dst[i+7:i] := src[i+7:i] - FI +FOR j := 0 to 15 + i := j*16 + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW - Move - - - - - Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := a[i+7:i] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := idx[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Store - - - - - Store packed 8-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Load - - - - Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW - Move - - - - Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - ELSE - dst[i+7:i] := 0 - FI +FOR j := 0 to 7 + i := j*16 + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + + AVX512BW - Load - - - - - Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 + i := j*16 + id := idx[i+3:i]*16 IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + dst[i+15:i] := a[id+15:id] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - - Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 + i := j*16 + id := idx[i+3:i]*16 IF k[j] - dst[i+7:i] := a[i+7:i] + dst[i+15:i] := a[id+15:id] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Store - - - - - Store packed 8-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". FOR j := 0 to 15 - i := j*8 - IF k[j] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI + i := j*16 + id := idx[i+3:i]*16 + dst[i+15:i] := a[id+15:id] ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Load - - - - Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + dst[i+15:i] := a[id+15:id] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Move - - - - Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 IF k[j] - dst[i+7:i] := a[i+7:i] + dst[i+15:i] := a[id+15:id] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 + dst[i+15:i] := a[id+15:id] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". FOR j := 0 to 31 i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) + IF a[i+7] + k[j] := 1 ELSE - dst[i+7:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := ABS(a[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW - Arithmetic - - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) + IF a[i+7] + k[j] := 1 ELSE - dst[i+7:i] := src[i+7:i] + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) + dst[i+7:i] := 0xFF ELSE dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) + dst[i+7:i] := 0xFF ELSE - dst[i+7:i] := src[i+7:i] + dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) + dst[i+15:i] := 0xFFFF ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) + dst[i+15:i] := 0xFFFF ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". FOR j := 0 to 15 i := j*16 - IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) + IF a[i+15] + k[j] := 1 ELSE - dst[i+15:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 - dst[i+15:i] := ABS(a[i+15:i]) + IF a[i+15] + k[j] := 1 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[4:0] := b[i+3:i] + (j & 0x10) + dst[i+7:i] := a[index*8+7:index*8] + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[4:0] := b[i+3:i] + (j & 0x10) + dst[i+7:i] := a[index*8+7:index*8] + FI ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] FOR j := 0 to 15 i := j*16 IF k[j] @@ -11685,37 +10973,29 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] FOR j := 0 to 15 i := j*16 IF k[j] @@ -11726,54 +11006,26 @@ FOR j := 0 to 15 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) -tmp_dst[271:256] := Saturate16(a[287:256]) -tmp_dst[287:272] := Saturate16(a[319:288]) -tmp_dst[303:288] := Saturate16(a[351:320]) -tmp_dst[319:304] := Saturate16(a[383:352]) -tmp_dst[335:320] := Saturate16(b[287:256]) -tmp_dst[351:336] := Saturate16(b[319:288]) -tmp_dst[367:352] := Saturate16(b[351:320]) -tmp_dst[383:368] := Saturate16(b[383:352]) -tmp_dst[399:384] := Saturate16(a[415:384]) -tmp_dst[415:400] := Saturate16(a[447:416]) -tmp_dst[431:416] := Saturate16(a[479:448]) -tmp_dst[447:432] := Saturate16(a[511:480]) -tmp_dst[463:448] := Saturate16(b[415:384]) -tmp_dst[479:464] := Saturate16(b[447:416]) -tmp_dst[495:480] := Saturate16(b[479:448]) -tmp_dst[511:496] := Saturate16(b[511:480]) -FOR j := 0 to 31 +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] @@ 
-11781,55 +11033,27 @@ FOR j := 0 to 31 dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) -tmp_dst[271:256] := Saturate16(a[287:256]) -tmp_dst[287:272] := Saturate16(a[319:288]) -tmp_dst[303:288] := Saturate16(a[351:320]) -tmp_dst[319:304] := Saturate16(a[383:352]) -tmp_dst[335:320] := Saturate16(b[287:256]) -tmp_dst[351:336] := Saturate16(b[319:288]) -tmp_dst[367:352] := Saturate16(b[351:320]) -tmp_dst[383:368] := Saturate16(b[383:352]) -tmp_dst[399:384] := Saturate16(a[415:384]) -tmp_dst[415:400] := Saturate16(a[447:416]) -tmp_dst[431:416] := Saturate16(a[479:448]) -tmp_dst[447:432] := Saturate16(a[511:480]) -tmp_dst[463:448] := Saturate16(b[415:384]) -tmp_dst[479:464] := Saturate16(b[447:416]) -tmp_dst[495:480] := Saturate16(b[479:448]) -tmp_dst[511:496] := Saturate16(b[511:480]) -FOR j := 0 to 31 +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] @@ -11837,79 
+11061,94 @@ FOR j := 0 to 31 dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(a[95:64]) -dst[63:48] := Saturate16(a[127:96]) -dst[79:64] := Saturate16(b[31:0]) -dst[95:80] := Saturate16(b[63:32]) -dst[111:96] := Saturate16(b[95:64]) -dst[127:112] := Saturate16(b[127:96]) -dst[143:128] := Saturate16(a[159:128]) -dst[159:144] := Saturate16(a[191:160]) -dst[175:160] := Saturate16(a[223:192]) -dst[191:176] := Saturate16(a[255:224]) -dst[207:192] := Saturate16(b[159:128]) -dst[223:208] := Saturate16(b[191:160]) -dst[239:224] := Saturate16(b[223:192]) -dst[255:240] := Saturate16(b[255:224]) -dst[271:256] := Saturate16(a[287:256]) -dst[287:272] := Saturate16(a[319:288]) -dst[303:288] := Saturate16(a[351:320]) -dst[319:304] := Saturate16(a[383:352]) -dst[335:320] := Saturate16(b[287:256]) -dst[351:336] := Saturate16(b[319:288]) -dst[367:352] := Saturate16(b[351:320]) -dst[383:368] := Saturate16(b[383:352]) -dst[399:384] := Saturate16(a[415:384]) -dst[415:400] := Saturate16(a[447:416]) -dst[431:416] := Saturate16(a[479:448]) -dst[447:432] := Saturate16(a[511:480]) -dst[463:448] := Saturate16(b[415:384]) -dst[479:464] := Saturate16(b[447:416]) -dst[495:480] := Saturate16(b[479:448]) -dst[511:496] := Saturate16(b[511:480]) -dst[MAX:512] := 0 +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 
16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] FOR j := 0 to 7 i := j*16 IF k[j] @@ -11920,29 +11159,24 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] FOR j := 0 to 7 i := j*16 IF k[j] @@ -11953,54 +11187,41 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] 
:= src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 i := j*8 IF k[j] @@ -12011,53 +11232,40 @@ FOR j := 0 to 31 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := 
src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 i := j*8 IF k[j] @@ -12068,86 +11276,41 @@ FOR j := 0 to 31 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) -tmp_dst[263:256] := Saturate8(a[271:256]) -tmp_dst[271:264] := Saturate8(a[287:272]) -tmp_dst[279:272] := Saturate8(a[303:288]) -tmp_dst[287:280] := Saturate8(a[319:304]) -tmp_dst[295:288] := Saturate8(a[335:320]) -tmp_dst[303:296] := Saturate8(a[351:336]) -tmp_dst[311:304] := Saturate8(a[367:352]) -tmp_dst[319:312] := Saturate8(a[383:368]) -tmp_dst[327:320] := Saturate8(b[271:256]) -tmp_dst[335:328] := Saturate8(b[287:272]) -tmp_dst[343:336] := 
Saturate8(b[303:288]) -tmp_dst[351:344] := Saturate8(b[319:304]) -tmp_dst[359:352] := Saturate8(b[335:320]) -tmp_dst[367:360] := Saturate8(b[351:336]) -tmp_dst[375:368] := Saturate8(b[367:352]) -tmp_dst[383:376] := Saturate8(b[383:368]) -tmp_dst[391:384] := Saturate8(a[399:384]) -tmp_dst[399:392] := Saturate8(a[415:400]) -tmp_dst[407:400] := Saturate8(a[431:416]) -tmp_dst[415:408] := Saturate8(a[447:432]) -tmp_dst[423:416] := Saturate8(a[463:448]) -tmp_dst[431:424] := Saturate8(a[479:464]) -tmp_dst[439:432] := Saturate8(a[495:480]) -tmp_dst[447:440] := Saturate8(a[511:496]) -tmp_dst[455:448] := Saturate8(b[399:384]) -tmp_dst[463:456] := Saturate8(b[415:400]) -tmp_dst[471:464] := Saturate8(b[431:416]) -tmp_dst[479:472] := Saturate8(b[447:432]) -tmp_dst[487:480] := Saturate8(b[463:448]) -tmp_dst[495:488] := Saturate8(b[479:464]) -tmp_dst[503:496] := Saturate8(b[495:480]) -tmp_dst[511:504] := Saturate8(b[511:496]) -FOR j := 0 to 63 +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] @@ -12155,87 +11318,42 @@ FOR j := 0 to 63 dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) -tmp_dst[263:256] := Saturate8(a[271:256]) -tmp_dst[271:264] := Saturate8(a[287:272]) -tmp_dst[279:272] := Saturate8(a[303:288]) -tmp_dst[287:280] := Saturate8(a[319:304]) -tmp_dst[295:288] := Saturate8(a[335:320]) -tmp_dst[303:296] := Saturate8(a[351:336]) -tmp_dst[311:304] := Saturate8(a[367:352]) -tmp_dst[319:312] := Saturate8(a[383:368]) -tmp_dst[327:320] := Saturate8(b[271:256]) -tmp_dst[335:328] := Saturate8(b[287:272]) -tmp_dst[343:336] := Saturate8(b[303:288]) 
-tmp_dst[351:344] := Saturate8(b[319:304]) -tmp_dst[359:352] := Saturate8(b[335:320]) -tmp_dst[367:360] := Saturate8(b[351:336]) -tmp_dst[375:368] := Saturate8(b[367:352]) -tmp_dst[383:376] := Saturate8(b[383:368]) -tmp_dst[391:384] := Saturate8(a[399:384]) -tmp_dst[399:392] := Saturate8(a[415:400]) -tmp_dst[407:400] := Saturate8(a[431:416]) -tmp_dst[415:408] := Saturate8(a[447:432]) -tmp_dst[423:416] := Saturate8(a[463:448]) -tmp_dst[431:424] := Saturate8(a[479:464]) -tmp_dst[439:432] := Saturate8(a[495:480]) -tmp_dst[447:440] := Saturate8(a[511:496]) -tmp_dst[455:448] := Saturate8(b[399:384]) -tmp_dst[463:456] := Saturate8(b[415:400]) -tmp_dst[471:464] := Saturate8(b[431:416]) -tmp_dst[479:472] := Saturate8(b[447:432]) -tmp_dst[487:480] := Saturate8(b[463:448]) -tmp_dst[495:488] := Saturate8(b[479:464]) -tmp_dst[503:496] := Saturate8(b[495:480]) -tmp_dst[511:504] := Saturate8(b[511:496]) -FOR j := 0 to 63 +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] @@ -12243,203 +11361,108 @@ FOR j := 0 to 63 dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(a[79:64]) -dst[47:40] := Saturate8(a[95:80]) -dst[55:48] := Saturate8(a[111:96]) -dst[63:56] := Saturate8(a[127:112]) -dst[71:64] := Saturate8(b[15:0]) -dst[79:72] := Saturate8(b[31:16]) -dst[87:80] := Saturate8(b[47:32]) -dst[95:88] := Saturate8(b[63:48]) -dst[103:96] := Saturate8(b[79:64]) -dst[111:104] := Saturate8(b[95:80]) -dst[119:112] := Saturate8(b[111:96]) -dst[127:120] := Saturate8(b[127:112]) -dst[135:128] := Saturate8(a[143:128]) -dst[143:136] := Saturate8(a[159:144]) -dst[151:144] := Saturate8(a[175:160]) -dst[159:152] := Saturate8(a[191:176]) -dst[167:160] := Saturate8(a[207:192]) -dst[175:168] := Saturate8(a[223:208]) -dst[183:176] := Saturate8(a[239:224]) -dst[191:184] := Saturate8(a[255:240]) -dst[199:192] := Saturate8(b[143:128]) -dst[207:200] := Saturate8(b[159:144]) -dst[215:208] := Saturate8(b[175:160]) -dst[223:216] := Saturate8(b[191:176]) -dst[231:224] := Saturate8(b[207:192]) -dst[239:232] := Saturate8(b[223:208]) -dst[247:240] := Saturate8(b[239:224]) -dst[255:248] := Saturate8(b[255:240]) -dst[263:256] := Saturate8(a[271:256]) -dst[271:264] := Saturate8(a[287:272]) -dst[279:272] := Saturate8(a[303:288]) -dst[287:280] := Saturate8(a[319:304]) -dst[295:288] := Saturate8(a[335:320]) -dst[303:296] := Saturate8(a[351:336]) -dst[311:304] := Saturate8(a[367:352]) -dst[319:312] := Saturate8(a[383:368]) -dst[327:320] := Saturate8(b[271:256]) -dst[335:328] := Saturate8(b[287:272]) -dst[343:336] := Saturate8(b[303:288]) -dst[351:344] := Saturate8(b[319:304]) -dst[359:352] := Saturate8(b[335:320]) -dst[367:360] := Saturate8(b[351:336]) -dst[375:368] := Saturate8(b[367:352]) -dst[383:376] := 
Saturate8(b[383:368]) -dst[391:384] := Saturate8(a[399:384]) -dst[399:392] := Saturate8(a[415:400]) -dst[407:400] := Saturate8(a[431:416]) -dst[415:408] := Saturate8(a[447:432]) -dst[423:416] := Saturate8(a[463:448]) -dst[431:424] := Saturate8(a[479:464]) -dst[439:432] := Saturate8(a[495:480]) -dst[447:440] := Saturate8(a[511:496]) -dst[455:448] := Saturate8(b[399:384]) -dst[463:456] := Saturate8(b[415:400]) -dst[471:464] := Saturate8(b[431:416]) -dst[479:472] := Saturate8(b[447:432]) -dst[487:480] := Saturate8(b[463:448]) -dst[495:488] := Saturate8(b[479:464]) -dst[503:496] := Saturate8(b[495:480]) -dst[511:504] := Saturate8(b[511:496]) -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Convert +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -FOR j := 0 to 15 +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] @@ -12447,40 +11470,34 @@ FOR j := 0 to 15 dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -FOR j := 0 to 15 +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] @@ -12488,192 +11505,283 @@ FOR j := 0 to 15 dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -tmp_dst[271:256] := SaturateU16(a[287:256]) -tmp_dst[287:272] := SaturateU16(a[319:288]) -tmp_dst[303:288] := SaturateU16(a[351:320]) -tmp_dst[319:304] := SaturateU16(a[383:352]) -tmp_dst[335:320] := SaturateU16(b[287:256]) -tmp_dst[351:336] := SaturateU16(b[319:288]) -tmp_dst[367:352] := SaturateU16(b[351:320]) -tmp_dst[383:368] := SaturateU16(b[383:352]) -tmp_dst[399:384] := SaturateU16(a[415:384]) -tmp_dst[415:400] := SaturateU16(a[447:416]) -tmp_dst[431:416] := SaturateU16(a[479:448]) -tmp_dst[447:432] := SaturateU16(a[511:480]) -tmp_dst[463:448] := SaturateU16(b[415:384]) -tmp_dst[479:464] := SaturateU16(b[447:416]) -tmp_dst[495:480] := SaturateU16(b[479:448]) -tmp_dst[511:496] := SaturateU16(b[511:480]) +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + 
dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -tmp_dst[271:256] := SaturateU16(a[287:256]) -tmp_dst[287:272] := SaturateU16(a[319:288]) -tmp_dst[303:288] := SaturateU16(a[351:320]) -tmp_dst[319:304] := SaturateU16(a[383:352]) -tmp_dst[335:320] := SaturateU16(b[287:256]) -tmp_dst[351:336] := SaturateU16(b[319:288]) -tmp_dst[367:352] := SaturateU16(b[351:320]) -tmp_dst[383:368] := SaturateU16(b[383:352]) -tmp_dst[399:384] := SaturateU16(a[415:384]) -tmp_dst[415:400] := SaturateU16(a[447:416]) -tmp_dst[431:416] := SaturateU16(a[479:448]) -tmp_dst[447:432] := SaturateU16(a[511:480]) -tmp_dst[463:448] := SaturateU16(b[415:384]) -tmp_dst[479:464] := SaturateU16(b[447:416]) -tmp_dst[495:480] := SaturateU16(b[479:448]) -tmp_dst[511:496] := SaturateU16(b[511:480]) +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] 
:= src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Miscellaneous + + + + + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". +
+ + + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[15:0] := SaturateU16(a[31:0]) -dst[31:16] := SaturateU16(a[63:32]) -dst[47:32] := SaturateU16(a[95:64]) -dst[63:48] := SaturateU16(a[127:96]) -dst[79:64] := SaturateU16(b[31:0]) -dst[95:80] := SaturateU16(b[63:32]) -dst[111:96] := SaturateU16(b[95:64]) -dst[127:112] := SaturateU16(b[127:96]) -dst[143:128] := SaturateU16(a[159:128]) -dst[159:144] := SaturateU16(a[191:160]) -dst[175:160] := SaturateU16(a[223:192]) -dst[191:176] := SaturateU16(a[255:224]) -dst[207:192] := SaturateU16(b[159:128]) -dst[223:208] := SaturateU16(b[191:160]) -dst[239:224] := SaturateU16(b[223:192]) -dst[255:240] := SaturateU16(b[255:224]) -dst[271:256] := SaturateU16(a[287:256]) -dst[287:272] := SaturateU16(a[319:288]) -dst[303:288] := SaturateU16(a[351:320]) -dst[319:304] := SaturateU16(a[383:352]) -dst[335:320] := SaturateU16(b[287:256]) -dst[351:336] := SaturateU16(b[319:288]) -dst[367:352] := SaturateU16(b[351:320]) -dst[383:368] := SaturateU16(b[383:352]) -dst[399:384] := SaturateU16(a[415:384]) -dst[415:400] := SaturateU16(a[447:416]) -dst[431:416] := SaturateU16(a[479:448]) -dst[447:432] := SaturateU16(a[511:480]) -dst[463:448] := SaturateU16(b[415:384]) -dst[479:464] := SaturateU16(b[447:416]) -dst[495:480] := SaturateU16(b[479:448]) -dst[511:496] := SaturateU16(b[511:480]) -dst[MAX:512] := 0 +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := 
src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Miscellaneous + + + + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] @@ -12684,29 +11792,31 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert + AVX512VL +
immintrin.h
Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] @@ -12717,1644 +11827,1334 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Miscellaneous + + + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512BW - Convert - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) -tmp_dst[263:256] := SaturateU8(a[271:256]) -tmp_dst[271:264] := SaturateU8(a[287:272]) -tmp_dst[279:272] := SaturateU8(a[303:288]) -tmp_dst[287:280] := SaturateU8(a[319:304]) -tmp_dst[295:288] := SaturateU8(a[335:320]) -tmp_dst[303:296] := SaturateU8(a[351:336]) -tmp_dst[311:304] := SaturateU8(a[367:352]) -tmp_dst[319:312] := SaturateU8(a[383:368]) -tmp_dst[327:320] := SaturateU8(b[271:256]) -tmp_dst[335:328] := 
SaturateU8(b[287:272]) -tmp_dst[343:336] := SaturateU8(b[303:288]) -tmp_dst[351:344] := SaturateU8(b[319:304]) -tmp_dst[359:352] := SaturateU8(b[335:320]) -tmp_dst[367:360] := SaturateU8(b[351:336]) -tmp_dst[375:368] := SaturateU8(b[367:352]) -tmp_dst[383:376] := SaturateU8(b[383:368]) -tmp_dst[391:384] := SaturateU8(a[399:384]) -tmp_dst[399:392] := SaturateU8(a[415:400]) -tmp_dst[407:400] := SaturateU8(a[431:416]) -tmp_dst[415:408] := SaturateU8(a[447:432]) -tmp_dst[423:416] := SaturateU8(a[463:448]) -tmp_dst[431:424] := SaturateU8(a[479:464]) -tmp_dst[439:432] := SaturateU8(a[495:480]) -tmp_dst[447:440] := SaturateU8(a[511:496]) -tmp_dst[455:448] := SaturateU8(b[399:384]) -tmp_dst[463:456] := SaturateU8(b[415:400]) -tmp_dst[471:464] := SaturateU8(b[431:416]) -tmp_dst[479:472] := SaturateU8(b[447:432]) -tmp_dst[487:480] := SaturateU8(b[463:448]) -tmp_dst[495:488] := SaturateU8(b[479:464]) -tmp_dst[503:496] := SaturateU8(b[495:480]) -tmp_dst[511:504] := SaturateU8(b[511:496]) -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) -tmp_dst[263:256] := SaturateU8(a[271:256]) -tmp_dst[271:264] := SaturateU8(a[287:272]) -tmp_dst[279:272] := SaturateU8(a[303:288]) -tmp_dst[287:280] := SaturateU8(a[319:304]) -tmp_dst[295:288] := SaturateU8(a[335:320]) -tmp_dst[303:296] := SaturateU8(a[351:336]) -tmp_dst[311:304] := SaturateU8(a[367:352]) -tmp_dst[319:312] := SaturateU8(a[383:368]) -tmp_dst[327:320] := SaturateU8(b[271:256]) -tmp_dst[335:328] := 
SaturateU8(b[287:272]) -tmp_dst[343:336] := SaturateU8(b[303:288]) -tmp_dst[351:344] := SaturateU8(b[319:304]) -tmp_dst[359:352] := SaturateU8(b[335:320]) -tmp_dst[367:360] := SaturateU8(b[351:336]) -tmp_dst[375:368] := SaturateU8(b[367:352]) -tmp_dst[383:376] := SaturateU8(b[383:368]) -tmp_dst[391:384] := SaturateU8(a[399:384]) -tmp_dst[399:392] := SaturateU8(a[415:400]) -tmp_dst[407:400] := SaturateU8(a[431:416]) -tmp_dst[415:408] := SaturateU8(a[447:432]) -tmp_dst[423:416] := SaturateU8(a[463:448]) -tmp_dst[431:424] := SaturateU8(a[479:464]) -tmp_dst[439:432] := SaturateU8(a[495:480]) -tmp_dst[447:440] := SaturateU8(a[511:496]) -tmp_dst[455:448] := SaturateU8(b[399:384]) -tmp_dst[463:456] := SaturateU8(b[415:400]) -tmp_dst[471:464] := SaturateU8(b[431:416]) -tmp_dst[479:472] := SaturateU8(b[447:432]) -tmp_dst[487:480] := SaturateU8(b[463:448]) -tmp_dst[495:488] := SaturateU8(b[479:464]) -tmp_dst[503:496] := SaturateU8(b[495:480]) -tmp_dst[511:504] := SaturateU8(b[511:496]) -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Load + + + + + Load 256-bits (composed of 16 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + AVX512BW - Convert - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 32 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(a[79:64]) -dst[47:40] := SaturateU8(a[95:80]) -dst[55:48] := SaturateU8(a[111:96]) -dst[63:56] := SaturateU8(a[127:112]) -dst[71:64] := SaturateU8(b[15:0]) -dst[79:72] := SaturateU8(b[31:16]) -dst[87:80] := SaturateU8(b[47:32]) -dst[95:88] := SaturateU8(b[63:48]) -dst[103:96] := SaturateU8(b[79:64]) -dst[111:104] := SaturateU8(b[95:80]) -dst[119:112] := SaturateU8(b[111:96]) -dst[127:120] := SaturateU8(b[127:112]) -dst[135:128] := SaturateU8(a[143:128]) -dst[143:136] := SaturateU8(a[159:144]) -dst[151:144] := SaturateU8(a[175:160]) -dst[159:152] := SaturateU8(a[191:176]) -dst[167:160] := SaturateU8(a[207:192]) -dst[175:168] := SaturateU8(a[223:208]) -dst[183:176] := SaturateU8(a[239:224]) -dst[191:184] := SaturateU8(a[255:240]) -dst[199:192] := SaturateU8(b[143:128]) -dst[207:200] := SaturateU8(b[159:144]) -dst[215:208] := SaturateU8(b[175:160]) -dst[223:216] := SaturateU8(b[191:176]) -dst[231:224] := SaturateU8(b[207:192]) -dst[239:232] := SaturateU8(b[223:208]) -dst[247:240] := SaturateU8(b[239:224]) -dst[255:248] := SaturateU8(b[255:240]) -dst[263:256] := SaturateU8(a[271:256]) -dst[271:264] := SaturateU8(a[287:272]) -dst[279:272] := SaturateU8(a[303:288]) -dst[287:280] := SaturateU8(a[319:304]) -dst[295:288] := SaturateU8(a[335:320]) -dst[303:296] := SaturateU8(a[351:336]) -dst[311:304] := SaturateU8(a[367:352]) -dst[319:312] := SaturateU8(a[383:368]) -dst[327:320] := SaturateU8(b[271:256]) -dst[335:328] := SaturateU8(b[287:272]) -dst[343:336] := SaturateU8(b[303:288]) -dst[351:344] := SaturateU8(b[319:304]) -dst[359:352] := SaturateU8(b[335:320]) -dst[367:360] := SaturateU8(b[351:336]) -dst[375:368] := SaturateU8(b[367:352]) -dst[383:376] := 
SaturateU8(b[383:368]) -dst[391:384] := SaturateU8(a[399:384]) -dst[399:392] := SaturateU8(a[415:400]) -dst[407:400] := SaturateU8(a[431:416]) -dst[415:408] := SaturateU8(a[447:432]) -dst[423:416] := SaturateU8(a[463:448]) -dst[431:424] := SaturateU8(a[479:464]) -dst[439:432] := SaturateU8(a[495:480]) -dst[447:440] := SaturateU8(a[511:496]) -dst[455:448] := SaturateU8(b[399:384]) -dst[463:456] := SaturateU8(b[415:400]) -dst[471:464] := SaturateU8(b[431:416]) -dst[479:472] := SaturateU8(b[447:432]) -dst[487:480] := SaturateU8(b[463:448]) -dst[495:488] := SaturateU8(b[479:464]) -dst[503:496] := SaturateU8(b[495:480]) -dst[511:504] := SaturateU8(b[511:496]) -dst[MAX:512] := 0 +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Load + + + + + Load 128-bits (composed of 8 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512BW AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 16 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + AVX512BW - Convert - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := a[i+15:i] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := a[i+15:i] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] + dst[i+15:i] := a[i+15:i] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] + dst[i+15:i] := a[i+15:i] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW - Arithmetic - - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] + dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] + dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] + dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] + dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Move + + + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + + AVX512BW AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + AVX512BW - Arithmetic - - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := src[i+7:i] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := 0 + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] FI ENDFOR -dst[MAX:256] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Store + + + + + + Store 256-bits (composed of 16 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX512BW - Arithmetic - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 32 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR -dst[MAX:512] := 0 +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Store + + + + + + Store 128-bits (composed of 16 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + AVX512BW - Arithmetic - - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW Arithmetic - - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW Arithmetic - - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW Arithmetic - - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW Arithmetic - - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Miscellaneous - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". - -FOR j := 0 to 3 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - dst[i+127:i] := tmp[127:0] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW - Miscellaneous - - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Miscellaneous - - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 @@ -14367,18 +13167,17 @@ FOR j := 0 to 31 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 @@ -14391,39 +13190,21 @@ FOR j := 0 to 31 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW Arithmetic - - - - - + + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 @@ -14431,22 +13212,22 @@ FOR j := 0 to 63 dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 @@ -14454,73 +13235,70 @@ FOR j := 0 to 63 dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 @@ -14528,23 +13306,22 @@ FOR j := 0 to 15 dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 @@ -14552,3184 +13329,3486 @@ FOR j := 0 to 15 dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW Arithmetic - - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Arithmetic - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512BW - Miscellaneous - - - - - Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+7:i] := b[i+7:i] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) ELSE - dst[i+7:i] := a[i+7:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Arithmetic + + + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512BW - Miscellaneous - - - - - Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+7:i] := b[i+7:i] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) ELSE - dst[i+7:i] := a[i+7:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Arithmetic + + + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512BW - Miscellaneous - - - - - Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := b[i+7:i] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE - dst[i+7:i] := a[i+7:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512BW - Miscellaneous - - - - - Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := b[i+15:i] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := a[i+15:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := b[i+15:i] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := a[i+15:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := b[i+15:i] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := a[i+15:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - - Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE - dst[i+7:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - dst[i+7:i] := a[7:0] + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Set - - - - - Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Set - - - - Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - - Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*8 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+7:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 IF k[j] - dst[i+7:i] := a[7:0] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := a[15:0] + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Set - - - - - Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Arithmetic + + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512BW - Miscellaneous - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Set - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Set - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := a[15:0] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ELSE - k[j] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ELSE - k[j] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] ELSE - k[j] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] ELSE - k[j] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - Mask - AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + Arithmetic + + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) ELSE - k[j] := 0 - FI + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) ELSE - k[j] := 0 - FI + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) ELSE - k[j] := 0 - FI + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ELSE - k[j] := 0 - FI + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ELSE - k[j] := 0 - FI + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ELSE - k[j] := 0 - FI + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ELSE - k[j] := 0 - FI + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) ELSE - k[j] := 0 - FI + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE - k[j] := 0 - FI + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE - k[j] := 0 - FI + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE - k[j] := 0 - FI + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE - k[j] := 0 - FI + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - k[j] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + Convert + + + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +FOR j := 0 to 15 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - k[j] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +FOR j := 0 to 15 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - k[j] := 0 + dst[i+7:i] := 0 FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - k[j] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - k[j] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:32] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) FOR j := 0 to 31 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - k[j] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -k[MAX:32] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - Mask - AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + Convert + + + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +FOR j := 0 to 31 i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +FOR j := 0 to 15 i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +FOR j := 0 to 15 i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + FI ENDFOR -k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) ELSE - k[j] := 0 + dst[l+7:l] := 0 FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer - Mask + Convert + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) ELSE - k[j] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -k[MAX:64] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) FI ENDFOR -k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) ELSE - k[j] := 0 + dst[l+7:l] := 0 FI ENDFOR -k[MAX:64] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) ELSE - k[j] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) ELSE - k[j] := 0 + dst[l+15:l] := 0 FI ENDFOR -k[MAX:64] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 +FOR j := 0 to 7 i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) ELSE - k[j] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + FI ENDFOR -k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) ELSE - k[j] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -k[MAX:16] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) FI ENDFOR -k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) ELSE - k[j] := 0 + dst[l+7:l] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) ELSE - k[j] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) FI ENDFOR -k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) ELSE - k[j] := 0 + dst[l+7:l] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512BW - Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
immintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + FI ENDFOR -k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + + AVX512BW AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512BW - Compare - - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer - Mask + Convert + + + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW - Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -17741,139 +16820,132 @@ k[MAX:16] := 0 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -17885,173 +16957,165 @@ k[MAX:32] := 0 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 - i := j*16 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -18062,147 +17126,133 @@ k[MAX:32] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -18213,180 +17263,166 @@ k[MAX:8] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -18397,147 +17433,133 @@ k[MAX:8] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -18548,179 +17570,166 @@ k[MAX:16] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -18731,140 +17740,133 @@ k[MAX:16] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -18875,174 +17877,166 @@ k[MAX:32] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + i := j*8 IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:32] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -19053,147 +18047,133 @@ k[MAX:32] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). CASE (imm8[2:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT @@ -19204,7 +18184,7 @@ k[MAX:8] := 0 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 @@ -19212,24 +18192,22 @@ FOR j := 0 to 7 k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 @@ -19237,24 +18215,22 @@ FOR j := 0 to 7 k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 @@ -19262,24 +18238,22 @@ FOR j := 0 to 7 k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 @@ -19287,24 +18261,22 @@ FOR j := 0 to 7 k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 @@ -19312,24 +18284,22 @@ FOR j := 0 to 7 k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 @@ -19337,24 +18307,22 @@ FOR j := 0 to 7 k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW + AVX512VL +
immintrin.h
Compare - - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 @@ -19362,3079 +18330,3000 @@ FOR j := 0 to 7 k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := idx[i+15:i] - FI + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := a[i+15:i] - FI + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := idx[i+15:i] - FI + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := a[i+15:i] - FI + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := 0 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := idx[i+15:i] + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := a[i+15:i] + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 - IF k[j] - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := 0 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - id := idx[i+3:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := src[i+15:i] + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 15 i := j*16 - id := idx[i+3:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*16 - id := idx[i+3:i]*16 - dst[i+15:i] := a[id+15:id] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - id := idx[i+4:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := src[i+15:i] - FI + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - id := idx[i+4:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - id := idx[i+4:i]*16 - dst[i+15:i] := a[id+15:id] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 - id := idx[i+2:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := src[i+15:i] - FI + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 i := j*16 - id := idx[i+2:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := 0 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*16 - id := idx[i+2:i]*16 - dst[i+15:i] := a[id+15:id] + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := src[i+15:i] + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := 0 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := src[i+15:i] + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := 0 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + AVX512BW - Arithmetic - - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 7 i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := 0 - FI + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Compare + + + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + AVX512BW - Arithmetic - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := 0 +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := src[i+31:i] +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := 0 +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE - dst[i+7:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. -FOR j := 0 to 63 +FOR j := 0 to 31 i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 15 i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Arithmetic - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 15 i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE - dst[i+15:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW - Arithmetic - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Compare + + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Arithmetic - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 7 i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE - dst[i+15:i] := src[i+15:i] + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. FOR j := 0 to 7 i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. FOR j := 0 to 31 i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. FOR j := 0 to 31 i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512BW - Arithmetic - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Compare + + + + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE - dst[i+7:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. -FOR j := 0 to 63 +FOR j := 0 to 15 i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE - dst[i+15:i] := src[i+15:i] + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := 0 - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Arithmetic - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512BW + AVX512VL
immintrin.h
-
- - Integer + Shift + + + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512BW - Arithmetic - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI ELSE dst[i+15:i] := 0 - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI ELSE dst[i+15:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Arithmetic - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := 0 - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 31 +FOR j := 0 to 15 i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". - -FOR j := 0 to 31 - i := j*8 - IF a[i+7] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - Mask - AVX512BW - Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". + Shift + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 63 - i := j*8 - IF a[i+7] - k[j] := 1 +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". - -FOR j := 0 to 15 - i := j*8 - IF a[i+7] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Miscellaneous - - - Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := 0xFF + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*16 IF k[j] - dst[i+7:i] := 0xFF + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW AVX512VL - Miscellaneous - - - Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := 0xFF + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*16 IF k[j] - dst[i+15:i] := 0xFFFF + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := 0xFFFF + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*16 IF k[j] - dst[i+15:i] := 0xFFFF + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - dst[l+7:l] := Saturate8(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Convert - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 8*j +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[l+7:l] := src[l+7:l] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Store - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) - FI -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512BW - Convert - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 8*j +FOR j := 0 to 7 + i := j*16 IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[l+7:l] := 0 + dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Unpack and interleave 32 bits from masks "a" and "b", and store the 64-bit result in "dst". -FOR j := 0 to 31 - i := 16*j - l := 8*j - dst[l+7:l] := Saturate8(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 +dst[31:0] := b[31:0] +dst[63:32] := a[31:0] +dst[MAX:64] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Miscellaneous + + + + + + Unpack and interleave 16 bits from masks "a" and "b", and store the 32-bit result in "dst". + +dst[15:0] := b[15:0] +dst[31:16] := a[15:0] +dst[MAX:32] := 0 + + AVX512BW - Convert - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] ENDFOR -dst[MAX:256] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - Store - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR FOR j := 0 to 31 - i := 16*j - l := 8*j + i := j*16 IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the uppper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR FOR j := 0 to 31 - i := 16*j - l := 8*j + i := j*16 IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[l+7:l] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". -FOR j := 0 to 7 - i := 16*j - l := 8*j - dst[l+7:l] := Saturate8(a[i+15:i]) +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + dst[i+127:i] := tmp[127:0] ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 16*j - l := 8*j +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Store - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Convert - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 16*j - l := 8*j +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[l+7:l] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 - l := j*16 IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+7:i] := b[i+7:i] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+7:i] := a[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 15 - i := j*8 - l := j*16 +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+15:i] := b[i+15:i] ELSE - dst[l+15:l] := 0 + dst[i+15:i] := a[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - l := j*16 - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+7:i] := a[7:0] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - l := j*16 IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+7:i] := a[7:0] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - l := j*16 IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+7:i] := a[7:0] ELSE - dst[l+15:l] := 0 + dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Miscellaneous + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Convert - - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*8 - l := j*16 +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+15:i] := a[15:0] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*8 - l := j*16 +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) + dst[i+15:i] := a[15:0] ELSE - dst[l+15:l] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - dst[l+7:l] := SaturateU8(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Convert - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 8*j +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+15:i] := idx[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Store - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 8*j +FOR j := 0 to 31 + i := j*16 IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 8*j +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE - dst[l+7:l] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW - Convert - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". FOR j := 0 to 31 - i := 16*j - l := 8*j - dst[l+7:l] := SaturateU8(a[i+15:i]) + i := j*16 + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512BW - Convert - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 - i := 16*j - l := 8*j + i := j*16 + id := idx[i+4:i]*16 IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) + dst[i+15:i] := a[id+15:id] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - Store - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512BW - Convert - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := 16*j - l := 8*j + i := j*16 + id := idx[i+4:i]*16 IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) + dst[i+15:i] := a[id+15:id] ELSE - dst[l+7:l] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". -FOR j := 0 to 7 - i := 16*j - l := 8*j - dst[l+7:l] := SaturateU8(a[i+15:i]) +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i]*16 + dst[i+15:i] := a[id+15:id] ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) +FOR j := 0 to 63 + i := j*8 + IF a[i+7] + k[j] := 1 ELSE - dst[l+7:l] := src[l+7:l] + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Store - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Convert - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". -FOR j := 0 to 7 - i := 16*j - l := 8*j +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) + dst[i+7:i] := 0xFF ELSE - dst[l+7:l] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 - IF a[i+15] - k[j] := 1 + IF k[j] + dst[i+15:i] := 0xFFFF ELSE - k[j] := 0 + dst[i+15:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Miscellaneous - - +
+ + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". FOR j := 0 to 31 @@ -22447,5196 +21336,5568 @@ FOR j := 0 to 31 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". +
+ + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". +FOR j := 0 to 63 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR FOR j := 0 to 7 - i := j*16 - IF a[i+15] - k[j] := 1 - ELSE - k[j] := 0 - FI + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - dst[l+7:l] := Truncate8(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Convert - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + + + Shuffle 8-bit integers in "a" within 128-bit lanes using the control in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 8*j +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI ELSE - dst[l+7:l] := src[l+7:l] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Store - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Convert - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 8*j +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI ELSE - dst[l+7:l] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - dst[l+7:l] := Truncate8(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512BW - Convert - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) +FOR j := 0 to 63 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 ELSE - dst[l+7:l] := src[l+7:l] + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - Store - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +tmp_dst[319:256] := a[319:256] +tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +tmp_dst[447:384] := a[447:384] +tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] FOR j := 0 to 31 - i := 16*j - l := 8*j + i := j*16 IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +tmp_dst[319:256] := a[319:256] +tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +tmp_dst[447:384] := a[447:384] +tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] FOR j := 0 to 31 - i := 16*j - l := 8*j + i := j*16 IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[l+7:l] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst". -FOR j := 0 to 7 - i := 16*j - l := 8*j - dst[l+7:l] := Truncate8(a[i+15:i]) -ENDFOR -dst[MAX:64] := 0 +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +dst[191:128] := a[191:128] +dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +dst[319:256] := a[319:256] +dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +dst[447:384] := a[447:384] +dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 16*j - l := 8*j +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +tmp_dst[383:320] := a[383:320] +tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +tmp_dst[511:448] := a[511:448] +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - Store - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Convert - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 16*j - l := 8*j +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +tmp_dst[383:320] := a[383:320] +tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +tmp_dst[511:448] := a[511:448] +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[l+7:l] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst". -FOR j := 0 to 15 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:256] := 0 +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] +dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +dst[255:192] := a[255:192] +dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +dst[383:320] := a[383:320] +dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +dst[511:448] := a[511:448] +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 i := j*8 - l := j*16 IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[l+15:l] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 i := j*8 - l := j*16 - dst[l+15:l] := ZeroExtend16(a[i+7:i]) + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Miscellaneous + + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + AVX512BW - Convert - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 - i := j*8 - l := j*16 + i := j*16 IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Convert - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 - i := j*8 - l := j*16 + i := j*16 IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[l+15:l] := 0 + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Miscellaneous + + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + AVX512BW - Convert - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 i := j*8 - l := j*16 IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Convert - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 i := j*8 - l := j*16 IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[l+15:l] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Miscellaneous + + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + AVX512BW - Arithmetic - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] + dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] + dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Miscellaneous + + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + AVX512BW - Arithmetic - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". - -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Load + + + + + Load 512-bits (composed of 64 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + AVX512BW - Arithmetic - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + Load 32-bit mask from memory into "k". -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 +k[31:0] := MEM[mem_addr+31:mem_addr] - + + AVX512BW
immintrin.h
-
- - Integer + Load + + + + + Load 64-bit mask from memory into "k". + +k[63:0] := MEM[mem_addr+63:mem_addr] + + AVX512BW - Arithmetic - - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := a[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] FI ENDFOR -dst[MAX:128] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Store + + + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 63 + i := j*8 + IF k[j] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 64 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 32-bit mask from "a" into memory. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 64-bit mask from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := ABS(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
Arithmetic - - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := ABS(a[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW +
immintrin.h
Arithmetic - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := ABS(a[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW +
immintrin.h
Arithmetic - - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW +
immintrin.h
Arithmetic - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW +
immintrin.h
Arithmetic - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW +
immintrin.h
Arithmetic - - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW +
immintrin.h
Arithmetic - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Arithmetic + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW +
immintrin.h
Arithmetic - - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW +
immintrin.h
Arithmetic - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW +
immintrin.h
Arithmetic - - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW +
immintrin.h
Arithmetic - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW +
immintrin.h
Arithmetic - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW +
immintrin.h
Arithmetic - - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW +
immintrin.h
Arithmetic - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". -FOR j := 0 to 63 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR -FOR j := 0 to 7 - i := j*64 - dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ - tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] - dst[i+63:i+16] := 0 +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*8 + i := j*16 IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[4:0] := b[i+3:i] + (j & 0x10) - dst[i+7:i] := a[index*8+7:index*8] - FI + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*8 + i := j*16 IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[4:0] := b[i+3:i] + (j & 0x10) - dst[i+7:i] := a[index*8+7:index*8] - FI + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Shuffle 8-bit integers in "a" within 128-bit lanes using the control in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[5:0] := b[i+3:i] + (j & 0x30) - dst[i+7:i] := a[index*8+7:index*8] - FI + dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[5:0] := b[i+3:i] + (j & 0x30) - dst[i+7:i] := a[index*8+7:index*8] - FI + dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". FOR j := 0 to 63 i := j*8 - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[5:0] := b[i+3:i] + (j & 0x30) - dst[i+7:i] := a[index*8+7:index*8] - FI + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[3:0] := b[i+3:i] - dst[i+7:i] := a[index*8+7:index*8] - FI + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[3:0] := b[i+3:i] - dst[i+7:i] := a[index*8+7:index*8] - FI + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Arithmetic + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -tmp_dst[319:256] := a[319:256] -tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] -tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] -tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] -tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] -tmp_dst[447:384] := a[447:384] -tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] -tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] -tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] -tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -tmp_dst[319:256] := a[319:256] -tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] -tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] -tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] -tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] -tmp_dst[447:384] := a[447:384] -tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] -tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] -tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] -tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". -dst[63:0] := a[63:0] -dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -dst[191:128] := a[191:128] -dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -dst[319:256] := a[319:256] -dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] -dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] -dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] -dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] -dst[447:384] := a[447:384] -dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] -dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] -dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] -dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) ELSE - dst[i+15:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] -tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] -tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] -tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] -tmp_dst[383:320] := a[383:320] -tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] -tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] -tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] -tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] -tmp_dst[511:448] := a[511:448] FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] -tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] -tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] -tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] -tmp_dst[383:320] := a[383:320] -tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] -tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] -tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] -tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] -tmp_dst[511:448] := a[511:448] FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". -dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -dst[127:64] := a[127:64] -dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -dst[255:192] := a[255:192] -dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] -dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] -dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] -dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] -dst[383:320] := a[383:320] -dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] -dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] -dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] -dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] -dst[511:448] := a[511:448] +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) -dst[255:128] := a[255:128] << (tmp*8) -dst[383:256] := a[383:256] << (tmp*8) -dst[511:384] := a[511:384] << (tmp*8) +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := 0 - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] - FI + dst[i+7:i] := src[i+7:i] + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := 0 - FI + dst[i+7:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". -FOR j := 0 to 31 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := 0 - FI + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Arithmetic + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". FOR j := 0 to 31 i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Arithmetic + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI + dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE - dst[i+15:i] := 0 - FI + dst[i+7:i] := src[i+7:i] + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI + dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Arithmetic + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Arithmetic + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". FOR j := 0 to 31 i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Arithmetic + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) ELSE - dst[i+15:i] := src[i+15:i] - FI + dst[i+15:i] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Arithmetic + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := 0 - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI + dst[i+15:i] := a[i+15:i] - b[i+15:i] ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[271:256] := Saturate16(a[287:256]) +tmp_dst[287:272] := Saturate16(a[319:288]) +tmp_dst[303:288] := Saturate16(a[351:320]) +tmp_dst[319:304] := Saturate16(a[383:352]) +tmp_dst[335:320] := Saturate16(b[287:256]) +tmp_dst[351:336] := Saturate16(b[319:288]) +tmp_dst[367:352] := Saturate16(b[351:320]) +tmp_dst[383:368] := Saturate16(b[383:352]) +tmp_dst[399:384] := Saturate16(a[415:384]) +tmp_dst[415:400] := Saturate16(a[447:416]) +tmp_dst[431:416] := Saturate16(a[479:448]) +tmp_dst[447:432] := Saturate16(a[511:480]) +tmp_dst[463:448] := Saturate16(b[415:384]) +tmp_dst[479:464] := Saturate16(b[447:416]) +tmp_dst[495:480] := Saturate16(b[479:448]) +tmp_dst[511:496] := Saturate16(b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[271:256] := Saturate16(a[287:256]) +tmp_dst[287:272] := Saturate16(a[319:288]) +tmp_dst[303:288] := Saturate16(a[351:320]) +tmp_dst[319:304] := Saturate16(a[383:352]) +tmp_dst[335:320] := Saturate16(b[287:256]) +tmp_dst[351:336] := Saturate16(b[319:288]) +tmp_dst[367:352] := Saturate16(b[351:320]) +tmp_dst[383:368] := Saturate16(b[383:352]) +tmp_dst[399:384] := Saturate16(a[415:384]) +tmp_dst[415:400] := Saturate16(a[447:416]) +tmp_dst[431:416] := Saturate16(a[479:448]) +tmp_dst[447:432] := Saturate16(a[511:480]) +tmp_dst[463:448] := Saturate16(b[415:384]) +tmp_dst[479:464] := Saturate16(b[447:416]) +tmp_dst[495:480] := Saturate16(b[479:448]) +tmp_dst[511:496] := Saturate16(b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) +dst[143:128] := Saturate16(a[159:128]) +dst[159:144] := Saturate16(a[191:160]) +dst[175:160] := Saturate16(a[223:192]) +dst[191:176] := Saturate16(a[255:224]) +dst[207:192] := Saturate16(b[159:128]) +dst[223:208] := Saturate16(b[191:160]) +dst[239:224] := Saturate16(b[223:192]) +dst[255:240] := Saturate16(b[255:224]) +dst[271:256] := Saturate16(a[287:256]) +dst[287:272] := Saturate16(a[319:288]) +dst[303:288] := Saturate16(a[351:320]) +dst[319:304] := Saturate16(a[383:352]) +dst[335:320] := Saturate16(b[287:256]) +dst[351:336] := Saturate16(b[319:288]) +dst[367:352] := Saturate16(b[351:320]) +dst[383:368] := Saturate16(b[383:352]) +dst[399:384] := Saturate16(a[415:384]) +dst[415:400] := Saturate16(a[447:416]) +dst[431:416] := Saturate16(a[479:448]) +dst[447:432] := Saturate16(a[511:480]) +dst[463:448] := Saturate16(b[415:384]) +dst[479:464] := Saturate16(b[447:416]) +dst[495:480] := Saturate16(b[479:448]) +dst[511:496] := Saturate16(b[511:480]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +tmp_dst[263:256] := Saturate8(a[271:256]) +tmp_dst[271:264] := Saturate8(a[287:272]) +tmp_dst[279:272] := Saturate8(a[303:288]) +tmp_dst[287:280] := Saturate8(a[319:304]) +tmp_dst[295:288] := Saturate8(a[335:320]) +tmp_dst[303:296] := Saturate8(a[351:336]) +tmp_dst[311:304] := Saturate8(a[367:352]) +tmp_dst[319:312] := Saturate8(a[383:368]) +tmp_dst[327:320] := Saturate8(b[271:256]) 
+tmp_dst[335:328] := Saturate8(b[287:272]) +tmp_dst[343:336] := Saturate8(b[303:288]) +tmp_dst[351:344] := Saturate8(b[319:304]) +tmp_dst[359:352] := Saturate8(b[335:320]) +tmp_dst[367:360] := Saturate8(b[351:336]) +tmp_dst[375:368] := Saturate8(b[367:352]) +tmp_dst[383:376] := Saturate8(b[383:368]) +tmp_dst[391:384] := Saturate8(a[399:384]) +tmp_dst[399:392] := Saturate8(a[415:400]) +tmp_dst[407:400] := Saturate8(a[431:416]) +tmp_dst[415:408] := Saturate8(a[447:432]) +tmp_dst[423:416] := Saturate8(a[463:448]) +tmp_dst[431:424] := Saturate8(a[479:464]) +tmp_dst[439:432] := Saturate8(a[495:480]) +tmp_dst[447:440] := Saturate8(a[511:496]) +tmp_dst[455:448] := Saturate8(b[399:384]) +tmp_dst[463:456] := Saturate8(b[415:400]) +tmp_dst[471:464] := Saturate8(b[431:416]) +tmp_dst[479:472] := Saturate8(b[447:432]) +tmp_dst[487:480] := Saturate8(b[463:448]) +tmp_dst[495:488] := Saturate8(b[479:464]) +tmp_dst[503:496] := Saturate8(b[495:480]) +tmp_dst[511:504] := Saturate8(b[511:496]) +FOR j := 0 to 63 + i := j*8 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +tmp_dst[263:256] := Saturate8(a[271:256]) +tmp_dst[271:264] := Saturate8(a[287:272]) +tmp_dst[279:272] := Saturate8(a[303:288]) +tmp_dst[287:280] := Saturate8(a[319:304]) +tmp_dst[295:288] := Saturate8(a[335:320]) +tmp_dst[303:296] := Saturate8(a[351:336]) +tmp_dst[311:304] := Saturate8(a[367:352]) +tmp_dst[319:312] := 
Saturate8(a[383:368]) +tmp_dst[327:320] := Saturate8(b[271:256]) +tmp_dst[335:328] := Saturate8(b[287:272]) +tmp_dst[343:336] := Saturate8(b[303:288]) +tmp_dst[351:344] := Saturate8(b[319:304]) +tmp_dst[359:352] := Saturate8(b[335:320]) +tmp_dst[367:360] := Saturate8(b[351:336]) +tmp_dst[375:368] := Saturate8(b[367:352]) +tmp_dst[383:376] := Saturate8(b[383:368]) +tmp_dst[391:384] := Saturate8(a[399:384]) +tmp_dst[399:392] := Saturate8(a[415:400]) +tmp_dst[407:400] := Saturate8(a[431:416]) +tmp_dst[415:408] := Saturate8(a[447:432]) +tmp_dst[423:416] := Saturate8(a[463:448]) +tmp_dst[431:424] := Saturate8(a[479:464]) +tmp_dst[439:432] := Saturate8(a[495:480]) +tmp_dst[447:440] := Saturate8(a[511:496]) +tmp_dst[455:448] := Saturate8(b[399:384]) +tmp_dst[463:456] := Saturate8(b[415:400]) +tmp_dst[471:464] := Saturate8(b[431:416]) +tmp_dst[479:472] := Saturate8(b[447:432]) +tmp_dst[487:480] := Saturate8(b[463:448]) +tmp_dst[495:488] := Saturate8(b[479:464]) +tmp_dst[503:496] := Saturate8(b[495:480]) +tmp_dst[511:504] := Saturate8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". -FOR j := 0 to 31 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) +dst[135:128] := Saturate8(a[143:128]) +dst[143:136] := Saturate8(a[159:144]) +dst[151:144] := Saturate8(a[175:160]) +dst[159:152] := Saturate8(a[191:176]) +dst[167:160] := Saturate8(a[207:192]) +dst[175:168] := Saturate8(a[223:208]) +dst[183:176] := Saturate8(a[239:224]) +dst[191:184] := Saturate8(a[255:240]) +dst[199:192] := Saturate8(b[143:128]) +dst[207:200] := Saturate8(b[159:144]) +dst[215:208] := Saturate8(b[175:160]) +dst[223:216] := Saturate8(b[191:176]) +dst[231:224] := Saturate8(b[207:192]) +dst[239:232] := Saturate8(b[223:208]) +dst[247:240] := Saturate8(b[239:224]) +dst[255:248] := Saturate8(b[255:240]) +dst[263:256] := Saturate8(a[271:256]) +dst[271:264] := Saturate8(a[287:272]) +dst[279:272] := Saturate8(a[303:288]) +dst[287:280] := Saturate8(a[319:304]) +dst[295:288] := Saturate8(a[335:320]) +dst[303:296] := Saturate8(a[351:336]) +dst[311:304] := Saturate8(a[367:352]) +dst[319:312] := Saturate8(a[383:368]) +dst[327:320] := Saturate8(b[271:256]) +dst[335:328] := Saturate8(b[287:272]) +dst[343:336] := Saturate8(b[303:288]) +dst[351:344] := Saturate8(b[319:304]) 
+dst[359:352] := Saturate8(b[335:320]) +dst[367:360] := Saturate8(b[351:336]) +dst[375:368] := Saturate8(b[367:352]) +dst[383:376] := Saturate8(b[383:368]) +dst[391:384] := Saturate8(a[399:384]) +dst[399:392] := Saturate8(a[415:400]) +dst[407:400] := Saturate8(a[431:416]) +dst[415:408] := Saturate8(a[447:432]) +dst[423:416] := Saturate8(a[463:448]) +dst[431:424] := Saturate8(a[479:464]) +dst[439:432] := Saturate8(a[495:480]) +dst[447:440] := Saturate8(a[511:496]) +dst[455:448] := Saturate8(b[399:384]) +dst[463:456] := Saturate8(b[415:400]) +dst[471:464] := Saturate8(b[431:416]) +dst[479:472] := Saturate8(b[447:432]) +dst[487:480] := Saturate8(b[463:448]) +dst[495:488] := Saturate8(b[479:464]) +dst[503:496] := Saturate8(b[495:480]) +dst[511:504] := Saturate8(b[511:496]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +tmp_dst[271:256] := SaturateU16(a[287:256]) +tmp_dst[287:272] := SaturateU16(a[319:288]) +tmp_dst[303:288] := SaturateU16(a[351:320]) +tmp_dst[319:304] := SaturateU16(a[383:352]) +tmp_dst[335:320] := SaturateU16(b[287:256]) +tmp_dst[351:336] := SaturateU16(b[319:288]) +tmp_dst[367:352] := SaturateU16(b[351:320]) +tmp_dst[383:368] := SaturateU16(b[383:352]) +tmp_dst[399:384] := SaturateU16(a[415:384]) +tmp_dst[415:400] := SaturateU16(a[447:416]) +tmp_dst[431:416] := SaturateU16(a[479:448]) +tmp_dst[447:432] := SaturateU16(a[511:480]) +tmp_dst[463:448] := SaturateU16(b[415:384]) +tmp_dst[479:464] := SaturateU16(b[447:416]) +tmp_dst[495:480] := SaturateU16(b[479:448]) +tmp_dst[511:496] := SaturateU16(b[511:480]) +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +tmp_dst[271:256] := SaturateU16(a[287:256]) +tmp_dst[287:272] := SaturateU16(a[319:288]) +tmp_dst[303:288] := SaturateU16(a[351:320]) +tmp_dst[319:304] := SaturateU16(a[383:352]) +tmp_dst[335:320] := SaturateU16(b[287:256]) +tmp_dst[351:336] := SaturateU16(b[319:288]) +tmp_dst[367:352] := SaturateU16(b[351:320]) +tmp_dst[383:368] := SaturateU16(b[383:352]) +tmp_dst[399:384] := SaturateU16(a[415:384]) +tmp_dst[415:400] := SaturateU16(a[447:416]) +tmp_dst[431:416] := SaturateU16(a[479:448]) +tmp_dst[447:432] := SaturateU16(a[511:480]) +tmp_dst[463:448] := SaturateU16(b[415:384]) +tmp_dst[479:464] := SaturateU16(b[447:416]) +tmp_dst[495:480] := SaturateU16(b[479:448]) +tmp_dst[511:496] := SaturateU16(b[511:480]) +FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[i+15:i] := tmp_dst[i+15:i] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) +dst[143:128] := SaturateU16(a[159:128]) +dst[159:144] := SaturateU16(a[191:160]) +dst[175:160] := SaturateU16(a[223:192]) +dst[191:176] := SaturateU16(a[255:224]) +dst[207:192] := SaturateU16(b[159:128]) +dst[223:208] := SaturateU16(b[191:160]) +dst[239:224] := SaturateU16(b[223:192]) +dst[255:240] := SaturateU16(b[255:224]) +dst[271:256] := SaturateU16(a[287:256]) +dst[287:272] := SaturateU16(a[319:288]) +dst[303:288] := SaturateU16(a[351:320]) +dst[319:304] := SaturateU16(a[383:352]) +dst[335:320] := SaturateU16(b[287:256]) +dst[351:336] := SaturateU16(b[319:288]) +dst[367:352] := SaturateU16(b[351:320]) +dst[383:368] := SaturateU16(b[383:352]) +dst[399:384] := SaturateU16(a[415:384]) +dst[415:400] := SaturateU16(a[447:416]) +dst[431:416] := SaturateU16(a[479:448]) +dst[447:432] := SaturateU16(a[511:480]) +dst[463:448] := SaturateU16(b[415:384]) +dst[479:464] := SaturateU16(b[447:416]) +dst[495:480] := SaturateU16(b[479:448]) +dst[511:496] := SaturateU16(b[511:480]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +tmp_dst[263:256] := SaturateU8(a[271:256]) +tmp_dst[271:264] := SaturateU8(a[287:272]) +tmp_dst[279:272] := SaturateU8(a[303:288]) +tmp_dst[287:280] := SaturateU8(a[319:304]) +tmp_dst[295:288] := SaturateU8(a[335:320]) +tmp_dst[303:296] := SaturateU8(a[351:336]) +tmp_dst[311:304] := SaturateU8(a[367:352]) +tmp_dst[319:312] := SaturateU8(a[383:368]) +tmp_dst[327:320] := 
SaturateU8(b[271:256]) +tmp_dst[335:328] := SaturateU8(b[287:272]) +tmp_dst[343:336] := SaturateU8(b[303:288]) +tmp_dst[351:344] := SaturateU8(b[319:304]) +tmp_dst[359:352] := SaturateU8(b[335:320]) +tmp_dst[367:360] := SaturateU8(b[351:336]) +tmp_dst[375:368] := SaturateU8(b[367:352]) +tmp_dst[383:376] := SaturateU8(b[383:368]) +tmp_dst[391:384] := SaturateU8(a[399:384]) +tmp_dst[399:392] := SaturateU8(a[415:400]) +tmp_dst[407:400] := SaturateU8(a[431:416]) +tmp_dst[415:408] := SaturateU8(a[447:432]) +tmp_dst[423:416] := SaturateU8(a[463:448]) +tmp_dst[431:424] := SaturateU8(a[479:464]) +tmp_dst[439:432] := SaturateU8(a[495:480]) +tmp_dst[447:440] := SaturateU8(a[511:496]) +tmp_dst[455:448] := SaturateU8(b[399:384]) +tmp_dst[463:456] := SaturateU8(b[415:400]) +tmp_dst[471:464] := SaturateU8(b[431:416]) +tmp_dst[479:472] := SaturateU8(b[447:432]) +tmp_dst[487:480] := SaturateU8(b[463:448]) +tmp_dst[495:488] := SaturateU8(b[479:464]) +tmp_dst[503:496] := SaturateU8(b[495:480]) +tmp_dst[511:504] := SaturateU8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +tmp_dst[263:256] := SaturateU8(a[271:256]) +tmp_dst[271:264] := SaturateU8(a[287:272]) +tmp_dst[279:272] := SaturateU8(a[303:288]) +tmp_dst[287:280] := SaturateU8(a[319:304]) +tmp_dst[295:288] := SaturateU8(a[335:320]) +tmp_dst[303:296] := SaturateU8(a[351:336]) +tmp_dst[311:304] := SaturateU8(a[367:352]) +tmp_dst[319:312] := SaturateU8(a[383:368]) 
+tmp_dst[327:320] := SaturateU8(b[271:256]) +tmp_dst[335:328] := SaturateU8(b[287:272]) +tmp_dst[343:336] := SaturateU8(b[303:288]) +tmp_dst[351:344] := SaturateU8(b[319:304]) +tmp_dst[359:352] := SaturateU8(b[335:320]) +tmp_dst[367:360] := SaturateU8(b[351:336]) +tmp_dst[375:368] := SaturateU8(b[367:352]) +tmp_dst[383:376] := SaturateU8(b[383:368]) +tmp_dst[391:384] := SaturateU8(a[399:384]) +tmp_dst[399:392] := SaturateU8(a[415:400]) +tmp_dst[407:400] := SaturateU8(a[431:416]) +tmp_dst[415:408] := SaturateU8(a[447:432]) +tmp_dst[423:416] := SaturateU8(a[463:448]) +tmp_dst[431:424] := SaturateU8(a[479:464]) +tmp_dst[439:432] := SaturateU8(a[495:480]) +tmp_dst[447:440] := SaturateU8(a[511:496]) +tmp_dst[455:448] := SaturateU8(b[399:384]) +tmp_dst[463:456] := SaturateU8(b[415:400]) +tmp_dst[471:464] := SaturateU8(b[431:416]) +tmp_dst[479:472] := SaturateU8(b[447:432]) +tmp_dst[487:480] := SaturateU8(b[463:448]) +tmp_dst[495:488] := SaturateU8(b[479:464]) +tmp_dst[503:496] := SaturateU8(b[495:480]) +tmp_dst[511:504] := SaturateU8(b[511:496]) +FOR j := 0 to 63 + i := j*8 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[i+7:i] := tmp_dst[i+7:i] ELSE - dst[i+15:i] := 0 + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) -dst[255:128] := a[255:128] >> (tmp*8) -dst[383:256] := a[383:256] >> (tmp*8) -dst[511:384] := a[511:384] >> (tmp*8) +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) +dst[135:128] := SaturateU8(a[143:128]) +dst[143:136] := SaturateU8(a[159:144]) +dst[151:144] := SaturateU8(a[175:160]) +dst[159:152] := SaturateU8(a[191:176]) +dst[167:160] := SaturateU8(a[207:192]) +dst[175:168] := SaturateU8(a[223:208]) +dst[183:176] := SaturateU8(a[239:224]) +dst[191:184] := SaturateU8(a[255:240]) +dst[199:192] := SaturateU8(b[143:128]) +dst[207:200] := SaturateU8(b[159:144]) +dst[215:208] := SaturateU8(b[175:160]) +dst[223:216] := SaturateU8(b[191:176]) +dst[231:224] := SaturateU8(b[207:192]) +dst[239:232] := SaturateU8(b[223:208]) +dst[247:240] := SaturateU8(b[239:224]) +dst[255:248] := SaturateU8(b[255:240]) +dst[263:256] := SaturateU8(a[271:256]) +dst[271:264] := SaturateU8(a[287:272]) +dst[279:272] := SaturateU8(a[303:288]) +dst[287:280] := SaturateU8(a[319:304]) +dst[295:288] := SaturateU8(a[335:320]) +dst[303:296] := SaturateU8(a[351:336]) +dst[311:304] := SaturateU8(a[367:352]) +dst[319:312] := SaturateU8(a[383:368]) +dst[327:320] := SaturateU8(b[271:256]) +dst[335:328] := SaturateU8(b[287:272]) 
+dst[343:336] := SaturateU8(b[303:288]) +dst[351:344] := SaturateU8(b[319:304]) +dst[359:352] := SaturateU8(b[335:320]) +dst[367:360] := SaturateU8(b[351:336]) +dst[375:368] := SaturateU8(b[367:352]) +dst[383:376] := SaturateU8(b[383:368]) +dst[391:384] := SaturateU8(a[399:384]) +dst[399:392] := SaturateU8(a[415:400]) +dst[407:400] := SaturateU8(a[431:416]) +dst[415:408] := SaturateU8(a[447:432]) +dst[423:416] := SaturateU8(a[463:448]) +dst[431:424] := SaturateU8(a[479:464]) +dst[439:432] := SaturateU8(a[495:480]) +dst[447:440] := SaturateU8(a[511:496]) +dst[455:448] := SaturateU8(b[399:384]) +dst[463:456] := SaturateU8(b[415:400]) +dst[471:464] := SaturateU8(b[431:416]) +dst[479:472] := SaturateU8(b[447:432]) +dst[487:480] := SaturateU8(b[463:448]) +dst[495:488] := SaturateU8(b[479:464]) +dst[503:496] := SaturateU8(b[495:480]) +dst[511:504] := SaturateU8(b[511:496]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := 16*j + l := 8*j IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[l+7:l] := Saturate8(a[i+15:i]) ELSE - dst[i+15:i] := 0 - FI + dst[l+7:l] := src[l+7:l] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 15 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := 16*j + l := 8*j IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[l+7:l] := Saturate8(a[i+15:i]) ELSE - dst[i+15:i] := src[i+15:i] - FI + dst[l+7:l] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 31 + i := j*8 + l := j*16 IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + dst[l+15:l] := SignExtend16(a[i+7:i]) ELSE - dst[i+15:i] := src[i+15:i] - FI + dst[l+15:l] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) ELSE - dst[i+15:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := 16*j + l := 8*j IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := 16*j + l := 8*j IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[l+7:l] := SaturateU8(a[i+15:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[l+7:l] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 31 + i := 16*j + l := 8*j IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[l+7:l] := Truncate8(a[i+15:i]) ELSE - dst[i+15:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 31 - i := j*16 + i := 16*j + l := 8*j IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := 16*j + l := 8*j IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[l+7:l] := Truncate8(a[i+15:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 + i := j*8 + l := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[l+15:l] := ZeroExtend16(a[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + dst[l+15:l] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+7:i] := a[7:0] ELSE - dst[i+15:i] := src[i+15:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + + + + Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI + dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Set + + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + AVX512BW - Shift - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 63 i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Compare + + + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + AVX512BW - Arithmetic - - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := 0 - FI + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := 0 - FI + k[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := 0 - FI + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Compare + + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + AVX512BW - Arithmetic - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 63 i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Compare + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512BW - Arithmetic - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Compare + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 + Compare + + + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := 0 - FI + k[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := 0 - FI + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 63 i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := src[i+7:i] - FI + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 63 i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE - dst[i+7:i] := 0 - FI + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 + Compare + + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 31 i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Compare + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Compare + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Arithmetic - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Arithmetic - - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Arithmetic - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Compare + + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*8 + i := j*16 IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 - ELSE + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 - ELSE + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:64] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -k[MAX:64] := 0 +k[MAX:32] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - Mask - AVX512VL + Compare + + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - Mask - AVX512VL + Compare + + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - Mask - AVX512VL + Compare + + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - Mask + Compare + + + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 - ELSE + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 - ELSE + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:8] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -k[MAX:8] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*8 + i := j*16 IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 - ELSE + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 31 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 63 i := j*8 IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. FOR j := 0 to 63 i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. -FOR j := 0 to 15 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 +FOR j := 0 to 31 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. +
+ + + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 63 + i := j*8 IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+ + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. -FOR j := 0 to 15 - i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 +FOR j := 0 to 63 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR -k[MAX:16] := 0 +k[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Compare - - - - +
+ + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. FOR j := 0 to 31 @@ -27649,17 +26910,15 @@ FOR j := 0 to 31 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512BW +
immintrin.h
Compare - - - +
+ + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. FOR j := 0 to 31 @@ -27668,1585 +26927,917 @@ FOR j := 0 to 31 ENDFOR k[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW +
immintrin.h
Compare - - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. +
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". -FOR j := 0 to 7 +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[383:256] := a[383:256] << (tmp*8) +dst[511:384] := a[511:384] << (tmp*8) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE - k[j] := 0 - FI + dst[i+15:i] := src[i+15:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512BW - Compare - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Shift + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Shift + + + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI ELSE - dst[i+7:i] := 0 - FI + dst[i+15:i] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Shift + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 +FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer + Shift + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Shift + + + + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) +dst[255:128] := a[255:128] >> (tmp*8) +dst[383:256] := a[383:256] >> (tmp*8) +dst[511:384] := a[511:384] >> (tmp*8) +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := src[i+15:i] - FI + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI ELSE dst[i+15:i] := 0 - FI + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL + Shift + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) FOR j := 0 to 31 - i := j*8 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+7:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 31 + i := j*16 IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ELSE - dst[i+7:i] := src[i+7:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512BW - Miscellaneous - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Miscellaneous - - - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Miscellaneous - - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Miscellaneous - - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Miscellaneous - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512BW - Miscellaneous - - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512BW - Miscellaneous - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE + IF count[63:0] > 15 dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512BW - Miscellaneous - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Miscellaneous - - - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Miscellaneous - - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 +FOR j := 0 to 31 i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE + IF imm8[7:0] > 15 dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) FI ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512BW - Store - - - - Store 512-bits (composed of 32 packed 16-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - -
immintrin.h
-
- - Integer - AVX512BW - Store - - - - Store 512-bits (composed of 64 packed 8-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Store - - - - Store 256-bits (composed of 16 packed 16-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Store - - - - Store 256-bits (composed of 32 packed 8-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Store - - - - Store 128-bits (composed of 8 packed 16-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Store - - - - Store 128-bits (composed of 16 packed 8-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - -
immintrin.h
-
- - Integer - AVX512BW - Load - - - Load 512-bits (composed of 32 packed 16-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512BW - Load - - - Load 512-bits (composed of 64 packed 8-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Load - - - Load 256-bits (composed of 16 packed 16-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Load - - - Load 256-bits (composed of 32 packed 8-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512BW - Load - - - Load 128-bits (composed of 8 packed 16-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512BW - Load - - - Load 128-bits (composed of 16 packed 8-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask - AVX512BW - Mask - - - + Shift + + + + + Add 32-bit masks in "a" and "b", and store the result in "k". k[31:0] := a[31:0] + b[31:0] k[MAX:32] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Add 64-bit masks in "a" and "b", and store the result in "k". k[63:0] := a[63:0] + b[63:0] k[MAX:64] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise AND of 32-bit masks "a" and "b", and store the result in "k". k[31:0] := a[31:0] AND b[31:0] k[MAX:32] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise AND of 64-bit masks "a" and "b", and store the result in "k". k[63:0] := a[63:0] AND b[63:0] k[MAX:64] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise NOT of 32-bit masks "a" and then AND with "b", and store the result in "k". k[31:0] := (NOT a[31:0]) AND b[31:0] k[MAX:32] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise NOT of 64-bit masks "a" and then AND with "b", and store the result in "k". k[63:0] := (NOT a[63:0]) AND b[63:0] k[MAX:64] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - +
+ + + Compute the bitwise NOT of 32-bit mask "a", and store the result in "k". k[31:0] := NOT a[31:0] k[MAX:32] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - +
+ + + Compute the bitwise NOT of 64-bit mask "a", and store the result in "k". k[63:0] := NOT a[63:0] k[MAX:64] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise OR of 32-bit masks "a" and "b", and store the result in "k". k[31:0] := a[31:0] OR b[31:0] k[MAX:32] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise OR of 64-bit masks "a" and "b", and store the result in "k". k[63:0] := a[63:0] OR b[63:0] k[MAX:64] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise XNOR of 32-bit masks "a" and "b", and store the result in "k". k[31:0] := NOT (a[31:0] XOR b[31:0]) k[MAX:32] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise XNOR of 64-bit masks "a" and "b", and store the result in "k". k[63:0] := NOT (a[63:0] XOR b[63:0]) k[MAX:64] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise XOR of 32-bit masks "a" and "b", and store the result in "k". k[31:0] := a[31:0] XOR b[31:0] k[MAX:32] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise XOR of 64-bit masks "a" and "b", and store the result in "k". k[63:0] := a[63:0] XOR b[63:0] k[MAX:64] := 0 - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Shift the bits of 32-bit mask "a" left by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". k[MAX:0] := 0 @@ -29254,16 +27845,15 @@ IF count[7:0] <= 31 k[31:0] := a[31:0] << count[7:0] FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Shift the bits of 64-bit mask "a" left by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". k[MAX:0] := 0 @@ -29271,16 +27861,15 @@ IF count[7:0] <= 63 k[63:0] := a[63:0] << count[7:0] FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Shift the bits of 32-bit mask "a" right by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". k[MAX:0] := 0 @@ -29288,16 +27877,15 @@ IF count[7:0] <= 31 k[31:0] := a[31:0] >> count[7:0] FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Shift the bits of 64-bit mask "a" right by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". k[MAX:0] := 0 @@ -29305,71 +27893,16 @@ IF count[7:0] <= 63 k[63:0] := a[63:0] >> count[7:0] FI - -
immintrin.h
-
- - Mask - AVX512BW - Load - - - Load 32-bit mask from memory into "k". - -k[31:0] := MEM[mem_addr+31:mem_addr] - - -
immintrin.h
-
- - Mask - AVX512BW - Load - - - Load 64-bit mask from memory into "k". - -k[63:0] := MEM[mem_addr+63:mem_addr] - - -
immintrin.h
-
- - Mask - AVX512BW - Store - - - - Store 32-bit mask from "a" into memory. - -MEM[mem_addr+31:mem_addr] := a[31:0] - - -
immintrin.h
-
- - Mask + AVX512BW - Store - - - - Store 64-bit mask from "a" into memory. - -MEM[mem_addr+63:mem_addr] := a[63:0] - -
immintrin.h
-
- - Mask - AVX512BW Mask - - - - + + + + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". tmp[31:0] := a[31:0] OR b[31:0] @@ -29384,16 +27917,15 @@ ELSE MEM[all_ones+7:all_ones] := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". tmp[31:0] := a[31:0] OR b[31:0] @@ -29403,16 +27935,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". tmp[31:0] := a[31:0] OR b[31:0] @@ -29422,17 +27953,16 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - - +
+ + + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". tmp[63:0] := a[63:0] OR b[63:0] @@ -29447,16 +27977,15 @@ ELSE MEM[all_ones+7:all_ones] := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". tmp[63:0] := a[63:0] OR b[63:0] @@ -29466,16 +27995,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". tmp[63:0] := a[63:0] OR b[63:0] @@ -29485,17 +28013,16 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - - +
+ + + + + Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". tmp1[31:0] := a[31:0] AND b[31:0] @@ -29511,16 +28038,15 @@ ELSE MEM[and_not+7:and_not] := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". tmp[31:0] := a[31:0] AND b[31:0] @@ -29530,16 +28056,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise NOT of 32-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". tmp[31:0] := (NOT a[31:0]) AND b[31:0] @@ -29549,17 +28074,16 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - - +
+ + + + + Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". tmp1[63:0] := a[63:0] AND b[63:0] @@ -29575,16 +28099,15 @@ ELSE MEM[and_not+7:and_not] := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". tmp[63:0] := a[63:0] AND b[63:0] @@ -29594,16 +28117,15 @@ ELSE dst := 0 FI - -
immintrin.h
-
- - Mask + AVX512BW +
immintrin.h
Mask - - - +
+ + + + Compute the bitwise NOT of 64-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". tmp[63:0] := (NOT a[63:0]) AND b[63:0] @@ -29613,64 +28135,64 @@ ELSE dst := 0 FI - -
immintrin.h
-
- + AVX512BW +
immintrin.h
Mask - - +
+ + + Convert 32-bit mask "a" into an integer value, and store the result in "dst". dst := ZeroExtend32(a[31:0]) - -
immintrin.h
-
- + AVX512BW +
immintrin.h
Mask - - +
+ + + Convert 64-bit mask "a" into an integer value, and store the result in "dst". dst := ZeroExtend64(a[63:0]) - -
immintrin.h
-
- + AVX512BW +
immintrin.h
Mask - - +
+ + + Convert integer value "a" into an 32-bit mask, and store the result in "k". k := ZeroExtend32(a[31:0]) - -
immintrin.h
-
- + AVX512BW +
immintrin.h
Mask - - +
+ + + Convert integer value "a" into an 64-bit mask, and store the result in "k". k := ZeroExtend64(a[63:0]) - + + AVX512BW
immintrin.h
-
- - Integer - AVX512VL - AVX512CD - Miscellaneous - - + Mask + + + + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". FOR j := 0 to 3 @@ -29679,16 +28201,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Miscellaneous - - +
+ + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". FOR j := 0 to 1 @@ -29697,16 +28218,15 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Miscellaneous - - +
+ + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". FOR j := 0 to 7 @@ -29715,16 +28235,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Miscellaneous - - +
+ + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". FOR j := 0 to 3 @@ -29733,16 +28252,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD - Compare - - + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 7 @@ -29755,18 +28273,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - - +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 7 @@ -29783,17 +28300,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 7 @@ -29810,16 +28326,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - +
+ + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 3 @@ -29832,18 +28347,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - - +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 3 @@ -29860,17 +28374,16 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 3 @@ -29887,16 +28400,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - +
+ + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 3 @@ -29909,18 +28421,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - - +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 3 @@ -29937,17 +28448,16 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 3 @@ -29964,16 +28474,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - +
+ + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 1 @@ -29986,18 +28495,17 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - - +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 1 @@ -30014,17 +28522,16 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Compare - - - +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 1 @@ -30041,16 +28548,15 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD - Bit Manipulation - - + AVX512VL +
immintrin.h
+ Compare +
+ + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". FOR j := 0 to 7 @@ -30064,18 +28570,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - - +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -30093,17 +28598,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -30121,16 +28625,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - +
+ + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". FOR j := 0 to 3 @@ -30144,18 +28647,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - - +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -30173,17 +28675,16 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -30201,16 +28702,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - +
+ + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". FOR j := 0 to 3 @@ -30224,18 +28724,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - - +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -30253,17 +28752,16 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -30281,16 +28779,15 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - +
+ + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". FOR j := 0 to 1 @@ -30304,18 +28801,17 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - - +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -30333,17 +28829,16 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512CD + AVX512VL +
immintrin.h
Bit Manipulation - - - +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -30361,15 +28856,17 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512CD - Swizzle - - + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". FOR j := 0 to 7 @@ -30378,15 +28875,14 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Swizzle - - +
+ + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". FOR j := 0 to 15 @@ -30395,15 +28891,14 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD - Compare - - +
immintrin.h
+ Swizzle +
+ + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 15 @@ -30416,17 +28911,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Compare - - - - +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 15 @@ -30443,16 +28937,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Compare - - - +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 15 @@ -30469,15 +28962,14 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Compare - - +
+ + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 7 @@ -30490,17 +28982,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Compare - - - - +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 7 @@ -30517,16 +29008,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Compare - - - +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". FOR j := 0 to 7 @@ -30543,15 +29033,14 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD - Bit Manipulation - - +
immintrin.h
+ Compare +
+ + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". FOR j := 0 to 15 @@ -30565,17 +29054,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Bit Manipulation - - - - +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 @@ -30593,16 +29081,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Bit Manipulation - - - +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 @@ -30620,15 +29107,14 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Bit Manipulation - - +
+ + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". FOR j := 0 to 7 @@ -30642,17 +29128,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Bit Manipulation - - - - +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -30670,16 +29155,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512CD +
immintrin.h
Bit Manipulation - - - +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -30697,19 +29181,19 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512CD
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Logical - - - - - + Bit Manipulation + + + + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -30722,18 +29206,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -30746,84 +29229,18 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Logical - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512DQ - Logical - - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512DQ +
immintrin.h
Logical - - - - - +
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -30836,18 +29253,17 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -30860,19 +29276,18 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - - +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -30885,18 +29300,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -30909,84 +29323,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Logical - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512DQ - Logical - - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512DQ +
immintrin.h
Logical - - - - - +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -30999,18 +29347,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -31023,19 +29370,18 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - - +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -31048,18 +29394,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -31072,84 +29417,18 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Logical - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512DQ - Logical - - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512DQ +
immintrin.h
Logical - - - - - +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -31162,18 +29441,17 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -31186,19 +29464,18 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - - +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -31211,18 +29488,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -31235,39 +29511,21 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512DQ Logical - - - - - + + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) @@ -31275,22 +29533,22 @@ FOR j := 0 to 15 dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) @@ -31298,398 +29556,394 @@ FOR j := 0 to 15 dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Logical - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512DQ AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512DQ - Miscellaneous - - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 - n := (j % 2)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 - n := (j % 2)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 2)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 2)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - n := (j % 2)*64 - dst[i+63:i] := a[n+63:n] + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 - n := (j % 2)*64 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 - n := (j % 2)*64 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - dst[i+63:i] := a[n+63:n] + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 + i := j*32 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- + + AVX512DQ AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512DQ - Miscellaneous - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst. + AVX512VL +
immintrin.h
+ Logical +
+ + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". FOR j := 0 to 7 i := j*32 @@ -31698,18 +29952,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 @@ -31722,17 +29976,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 @@ -31745,100 +29999,103 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst. +
+ + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+63:i] := a[n+63:n] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+63:i] := a[n+63:n] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - +
+ + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst. -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 n := (j % 2)*32 dst[i+31:i] := a[n+31:n] ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 n := (j % 2)*32 IF k[j] @@ -31847,21 +30104,21 @@ FOR j := 0 to 3 dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 n := (j % 2)*32 IF k[j] @@ -31870,79 +30127,82 @@ FOR j := 0 to 3 dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst". +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst. -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 8)*32 + n := (j % 2)*32 dst[i+31:i] := a[n+31:n] ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - - Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 8)*32 + n := (j % 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 8)*32 + n := (j % 2)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - +
+ + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". FOR j := 0 to 3 @@ -31952,17 +30212,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - +
+ + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -31976,16 +30236,16 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - +
+ + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -31999,2197 +30259,2449 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:512] := 0 +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 i := j*64 - n := (j % 2)*64 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- + AVX512DQ + AVX512VL +
immintrin.h
Miscellaneous - - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 i := j*64 - n := (j % 2)*64 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512DQ - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 + Miscellaneous + + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 3 i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ENDFOR -dst[MAX:512] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 3 i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ENDFOR -dst[MAX:512] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ELSE - dst[i+63:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:2] := 0 - + + AVX512DQ + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Miscellaneous + + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 7 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 3 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - + + AVX512DQ + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Miscellaneous + + + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) -ENDFOR +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512DQ - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + Miscellaneous + + + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + i := j*32 + IF a[i+31] + k[j] := 1 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +FOR j := 0 to 3 + i := j*32 + IF a[i+31] + k[j] := 1 ELSE - dst[i+63:i] := src[i+63:i] + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". FOR j := 0 to 7 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + dst[i+31:i] := 0xFFFFFFFF ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + dst[i+31:i] := 0xFFFFFFFF ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - Integer AVX512VL - AVX512DQ - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + IF a[i+63] + k[j] := 1 ELSE - dst[i+63:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + IF a[i+63] + k[j] := 1 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} FOR j := 0 to 3 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} FOR j := 0 to 3 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512DQ - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + Miscellaneous + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 7 +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 7 +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 7 +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 7 +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} FOR j := 0 to 7 - i := j*64 - l := j*32 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + + AVX512DQ AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:256] := 0 + + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 1 - i := j*64 - l := j*32 +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 1 - i := j*64 - l := j*32 +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} FOR j := 0 to 3 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} FOR j := 0 to 3 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512DQ - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + Miscellaneous + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] -FOR j := 0 to 7 +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -FOR j := 0 to 7 +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -FOR j := 0 to 7 +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512DQ + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Miscellaneous + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} FOR j := 0 to 7 - i := j*64 - l := j*32 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} FOR j := 0 to 7 - i := j*64 - l := j*32 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -FOR j := 0 to 1 - i := j*64 - l := j*32 +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -FOR j := 0 to 1 - i := j*64 - l := j*32 +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + + AVX512DQ AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + AVX512DQ - Convert - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - [round_note] +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 3 i := j*64 l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE - dst[l+31:l] := src[l+31:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE - dst[l+31:l] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE - dst[l+31:l] := src[l+31:l] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ELSE - dst[l+31:l] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ELSE - dst[l+31:l] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". FOR j := 0 to 1 i := j*64 l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ENDFOR -dst[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ELSE - dst[l+31:l] := src[l+31:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ELSE - dst[l+31:l] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". +
+ + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512DQ Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512DQ + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Convert + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 + l := j*32 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 + l := j*32 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 + l := j*32 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 + l := j*32 IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". FOR j := 0 to 3 i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512DQ Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512DQ + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Convert + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) @@ -34197,19 +32709,17 @@ FOR j := 0 to 7 dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - +
+ + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". FOR j := 0 to 1 @@ -34218,19 +32728,17 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -34243,18 +32751,16 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 @@ -34267,17 +32773,15 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - +
+ + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". FOR j := 0 to 3 @@ -34287,19 +32791,17 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -34313,18 +32815,16 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 @@ -34338,1292 +32838,1472 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 l := j*32 dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". FOR j := 0 to 1 i := j*64 l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 3 i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 1 i := j*64 l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ + AVX512VL +
immintrin.h
Convert - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - + + AVX512DQ + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Arithmetic + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - Integer + Logical + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - l := j*32 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE - dst[l+31:l] := src[l+31:l] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL + AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - l := j*32 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ELSE - dst[l+31:l] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - Integer - AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512DQ - Convert - - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + Logical + + + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE - dst[l+31:l] := src[l+31:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE - dst[l+31:l] := src[l+31:l] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 i := j*64 - l := j*32 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE - dst[l+31:l] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - Integer + Logical + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - l := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE - dst[l+31:l] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Logical + + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Logical + + + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - l := j*32 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE - dst[l+31:l] := src[l+31:l] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Logical + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". -FOR j := 0 to 1 - i := j*64 - l := j*32 +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + dst[i+31:i] := a[n+31:n] ELSE - dst[l+31:l] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". +
+ + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF -0: dst[255:0] := a[255:0] -1: dst[255:0] := a[511:256] -ESAC -dst[MAX:256] := 0 +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 + n := (j % 8)*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 + n := (j % 8)*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". +
+ + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 + n := (j % 2)*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 + n := (j % 2)*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst. + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -CASE imm8[1:0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -2: dst[127:0] := a[383:256] -3: dst[127:0] := a[511:384] -ESAC -dst[MAX:128] := 0 +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst". -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 1 +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 + n := (j % 2)*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 + n := (j % 2)*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512DQ +
immintrin.h
Miscellaneous - - - - Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". +
+ + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". CASE imm8[0] OF 0: dst[255:0] := a[255:0] @@ -35631,19 +34311,18 @@ CASE imm8[0] OF ESAC dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). CASE imm8[0] OF 0: tmp[255:0] := a[255:0] @@ -35659,18 +34338,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). CASE imm8[0] OF 0: tmp[255:0] := a[255:0] @@ -35686,43 +34364,43 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". -CASE imm8[0] OF +CASE imm8[1:0] OF 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] ESAC dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -CASE imm8[0] OF +CASE imm8[1:0] OF 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 1 i := j*64 @@ -35734,23 +34412,23 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF +CASE imm8[1:0] OF 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 1 i := j*64 @@ -35762,16 +34440,85 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ
immintrin.h
-
- - Integer + Miscellaneous + + + + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - +
+ + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". CASE imm8[1:0] OF @@ -35782,18 +34529,17 @@ CASE imm8[1:0] OF ESAC dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512DQ +
immintrin.h
Miscellaneous - - - - - +
+ + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). CASE imm8[1:0] OF @@ -35812,17 +34558,16 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512DQ +
immintrin.h
Miscellaneous - - - - +
+ + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). CASE imm8[1:0] OF @@ -35841,42 +34586,36 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - +
+ + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". [fpclass_note] - FOR j := 0 to 3 + FOR j := 0 to 7 i := j*64 k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) ENDFOR -k[MAX:4] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [fpclass_note] - FOR j := 0 to 3 + FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) @@ -35884,153 +34623,18 @@ k[MAX:4] := 0 k[j] := 0 FI ENDFOR -k[MAX:4] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Floating Point - Mask + AVX512DQ +
immintrin.h
Miscellaneous - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 7 - i := j*64 - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512DQ - Miscellaneous - - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL - AVX512DQ - Miscellaneous - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 1 - i := j*64 - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) -ENDFOR -k[MAX:2] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL - AVX512DQ - Miscellaneous - - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL - AVX512DQ - Miscellaneous - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 7 - i := j*32 - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL - AVX512DQ - Miscellaneous - - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512DQ - Miscellaneous - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". [fpclass_note] FOR j := 0 to 15 i := j*32 @@ -36038,18 +34642,16 @@ k[MAX:8] := 0 ENDFOR k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point - Mask + AVX512DQ +
immintrin.h
Miscellaneous - - - - +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [fpclass_note] FOR j := 0 to 15 @@ -36062,79 +34664,30 @@ k[MAX:16] := 0 ENDFOR k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL - AVX512DQ - Miscellaneous - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 3 - i := j*32 - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) -ENDFOR -k[MAX:4] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL + AVX512DQ - Miscellaneous - - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512DQ Miscellaneous - - - + + + + + Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". [fpclass_note] k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) k[MAX:1] := 0 - -
immintrin.h
-
- - Floating Point - Mask + AVX512DQ +
immintrin.h
Miscellaneous - - - - +
+ + + + + Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [fpclass_note] IF k1[0] @@ -36144,34 +34697,30 @@ ELSE FI k[MAX:1] := 0 - -
immintrin.h
-
- - Floating Point - Mask + AVX512DQ +
immintrin.h
Miscellaneous - - - +
+ + + + Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k. [fpclass_note] k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) k[MAX:1] := 0 - -
immintrin.h
-
- - Floating Point - Mask + AVX512DQ +
immintrin.h
Miscellaneous - - - - +
+ + + + + Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [fpclass_note] IF k1[0] @@ -36181,17 +34730,16 @@ ELSE FI k[MAX:1] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - +
+ + + + + Copy "a" to "dst", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". dst[511:0] := a[511:0] @@ -36201,19 +34749,18 @@ CASE (imm8[0]) OF ESAC dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36231,18 +34778,17 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - +
+ + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36260,99 +34806,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Miscellaneous - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE imm8[0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Miscellaneous - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ Miscellaneous - - - - + + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". dst[511:0] := a[511:0] @@ -36364,19 +34827,18 @@ CASE imm8[1:0] OF ESAC dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36396,18 +34858,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - +
+ + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36427,16 +34888,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- + AVX512DQ +
immintrin.h
Miscellaneous - - - - +
+ + + + + Copy "a" to "dst", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". dst[511:0] := a[511:0] @@ -36446,18 +34907,18 @@ CASE imm8[0] OF ESAC dst[MAX:512] := 0 - -
immintrin.h
-
- + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36475,17 +34936,17 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- + AVX512DQ +
immintrin.h
Miscellaneous - - - - - +
+ + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36503,95 +34964,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - AVX512VL - AVX512DQ - Miscellaneous - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE imm8[0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512VL - AVX512DQ - Miscellaneous - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512VL + AVX512DQ - Miscellaneous - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512DQ Miscellaneous - - - - + + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". dst[511:0] := a[511:0] @@ -36603,18 +34985,18 @@ CASE imm8[1:0] OF ESAC dst[MAX:512] := 0 - -
immintrin.h
-
- + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36634,17 +35016,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- + AVX512DQ +
immintrin.h
Miscellaneous - - - - - +
+ + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). tmp[511:0] := a[511:0] @@ -36664,825 +35046,596 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] +FOR j := 0 to 15 + i := j*32 + IF a[i+31] + k[j] := 1 ELSE - dst[i+63:i] := src[i+63:i] + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+31:i] := 0xFFFFFFFF ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + IF a[i+63] + k[j] := 1 ELSE - dst[i+63:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} FOR j := 0 to 7 i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] -FOR j := 0 to 1 +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 1 +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Logical - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + Miscellaneous + + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Logical - - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Logical - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512DQ - Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". - -FOR j := 0 to 7 - i := j*32 - IF a[i+31] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512DQ Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". + + + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} FOR j := 0 to 15 - i := j*32 - IF a[i+31] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512DQ - Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". - -FOR j := 0 to 3 - i := j*32 - IF a[i+31] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Miscellaneous - - - Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := 0xFFFFFFFF + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512DQ +
immintrin.h
Miscellaneous - - - Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := 0xFFFFFFFF + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] -FOR j := 0 to 3 +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := 0xFFFFFFFF + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Miscellaneous - - - Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := 0xFFFFFFFFFFFFFFFF - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512DQ - Miscellaneous - - - Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := 0xFFFFFFFFFFFFFFFF - ELSE - dst[i+63:i] := 0 - FI -ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Miscellaneous - - - Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := 0xFFFFFFFFFFFFFFFF - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512DQ - Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". - -FOR j := 0 to 3 - i := j*64 - IF a[i+63] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - -
immintrin.h
-
- - Integer - Mask + AVX512DQ - Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". - -FOR j := 0 to 7 - i := j*64 - IF a[i+63] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512DQ Miscellaneous - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". - -FOR j := 0 to 1 - i := j*64 - IF a[i+63] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Arithmetic - - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Arithmetic - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Arithmetic - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512DQ - Arithmetic - - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512DQ - Arithmetic - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512DQ - Arithmetic - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + Miscellaneous + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[63:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Arithmetic - - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512DQ - Arithmetic - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512DQ - Arithmetic - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] OF @@ -37501,30 +35654,27 @@ DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { RETURN dst } -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. @@ -37545,31 +35695,29 @@ DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { RETURN dst } -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] OF @@ -37588,114 +35736,26 @@ DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { RETURN dst } -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. @@ -37716,111 +35776,26 @@ DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { RETURN dst } -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] @@ -37841,157 +35816,26 @@ DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { RETURN dst } -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR +dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] OF @@ -38003,37 +35847,34 @@ DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE signSelCtl[1:0] OF 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] + 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. @@ -38047,38 +35888,36 @@ DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE signSelCtl[1:0] OF 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] + 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE opCtl[1:0] OF @@ -38090,33 +35929,33 @@ DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE signSelCtl[1:0] OF 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] + 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:256] := 0 +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. @@ -38130,38 +35969,33 @@ DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE signSelCtl[1:0] OF 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] + 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] @@ -38175,3346 +36009,3427 @@ DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { CASE signSelCtl[1:0] OF 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] + 1: dst[31:0] := tmp[31:0] 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) ESAC RETURN dst } -FOR j := 0 to 15 - i := j*32 +dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512VL + Miscellaneous + + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512VL + Miscellaneous + + + + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512VL + Miscellaneous + + + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] } -dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +
+ + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] +
+ + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] } -dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
+ + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) FI - RETURN tmp[63:0] + RETURN tmp[31:0] } -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 +dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ +
immintrin.h
Miscellaneous - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + Convert + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - RETURN tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI -} FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Miscellaneous - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) + dst[i+63:i] := src[i+63:i] FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 FI - RETURN tmp[63:0] -} -dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + Convert + + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + [round_note] -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512DQ - Miscellaneous - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + Convert + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Miscellaneous - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512DQ - Logical - - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] FOR j := 0 to 7 i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 + l := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 - i := j*32 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512DQ - Logical - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512DQ - Logical - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Add 8-bit masks in "a" and "b", and store the result in "k". - -k[7:0] := a[7:0] + b[7:0] -k[MAX:8] := 0 - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - Add 16-bit masks in "a" and "b", and store the result in "k". + Convert + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -k[15:0] := a[15:0] + b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Compute the bitwise AND of 8-bit masks "a" and "b", and store the result in "k". - -k[7:0] := a[7:0] AND b[7:0] -k[MAX:8] := 0 - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - Compute the bitwise NOT of 8-bit masks "a" and then AND with "b", and store the result in "k". + Convert + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -k[7:0] := (NOT a[7:0]) AND b[7:0] -k[MAX:8] := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - Compute the bitwise NOT of 8-bit mask "a", and store the result in "k". - -k[7:0] := NOT a[7:0] -k[MAX:8] := 0 - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - Compute the bitwise OR of 8-bit masks "a" and "b", and store the result in "k". + Convert + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] -k[7:0] := a[7:0] OR b[7:0] -k[MAX:8] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Compute the bitwise XNOR of 8-bit masks "a" and "b", and store the result in "k". - -k[7:0] := NOT (a[7:0] XOR b[7:0]) -k[MAX:8] := 0 - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - Compute the bitwise XOR of 8-bit masks "a" and "b", and store the result in "k". + Convert + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". -k[7:0] := a[7:0] XOR b[7:0] -k[MAX:8] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Shift the bits of 8-bit mask "a" left by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 7 - k[7:0] := a[7:0] << count[7:0] -FI - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - Shift the bits of 8-bit mask "a" right by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". + Convert + + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] -k[MAX:0] := 0 -IF count[7:0] <= 7 - k[7:0] := a[7:0] >> count[7:0] -FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Load - - - Load 8-bit mask from memory into "k". - -k[7:0] := MEM[mem_addr+7:mem_addr] - -
immintrin.h
-
- - Mask - AVX512DQ - Store - - - - Store 8-bit mask from "a" into memory. + Convert + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -MEM[mem_addr+7:mem_addr] := a[7:0] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - - Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". - -tmp[7:0] := a[7:0] OR b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -IF tmp[7:0] == 0xFF - MEM[all_ones+7:all_ones] := 1 -ELSE - MEM[all_ones+7:all_ones] := 0 -FI - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + Convert + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -tmp[7:0] := a[7:0] OR b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". - -tmp[7:0] := a[7:0] OR b[7:0] -IF tmp[7:0] == 0xFF - dst := 1 -ELSE - dst := 0 -FI - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - - Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + Convert + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp1[7:0] := a[7:0] AND b[7:0] -IF tmp1[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -tmp2[7:0] := (NOT a[7:0]) AND b[7:0] -IF tmp2[7:0] == 0x0 - MEM[and_not+7:and_not] := 1 -ELSE - MEM[and_not+7:and_not] := 0 -FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". - -tmp[7:0] := a[7:0] AND b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - -
immintrin.h
-
- - Mask - AVX512DQ - Mask - - - - Compute the bitwise NOT of 8-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] -tmp[7:0] := (NOT a[7:0]) AND b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". -tmp1[15:0] := a[15:0] AND b[15:0] -IF tmp1[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -tmp2[15:0] := (NOT a[15:0]) AND b[15:0] -IF tmp2[15:0] == 0x0 - MEM[and_not+7:and_not] := 1 -ELSE - MEM[and_not+7:and_not] := 0 -FI +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] -tmp[15:0] := a[15:0] AND b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Mask + AVX512DQ - Mask - - - - Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[15:0] := (NOT a[15:0]) AND b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- + AVX512DQ - Mask - - - Convert 8-bit mask "a" into an integer value, and store the result in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -dst := ZeroExtend32(a[7:0]) +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- + AVX512DQ - Mask - - - Convert integer value "a" into an 8-bit mask, and store the result in "k". +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -k := a[7:0] +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [sae_note] + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] + Convert + + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [sae_note] + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + [round_note] FOR j := 0 to 7 i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. + Convert + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] + Convert + + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. + Convert + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] + Convert + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] -dst[63:0] := (1.0 / b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. + Convert + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -dst[63:0] := (1.0 / b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + Convert + + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -IF k[0] - dst[63:0] := (1.0 / b[63:0]) -ELSE - dst[63:0] := src[63:0] +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + Add 8-bit masks in "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] + b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Add 16-bit masks in "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] + b[15:0] +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] AND b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 8-bit masks "a" and then AND with "b", and store the result in "k". + +k[7:0] := (NOT a[7:0]) AND b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 8-bit mask "a", and store the result in "k". + +k[7:0] := NOT a[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] OR b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := NOT (a[7:0] XOR b[7:0]) +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] XOR b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 8-bit mask "a" left by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 7 + k[7:0] := a[7:0] << count[7:0] FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. + Mask + + + + + + Shift the bits of 8-bit mask "a" right by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". -IF k[0] - dst[63:0] := (1.0 / b[63:0]) +k[MAX:0] := 0 +IF count[7:0] <= 7 + k[7:0] := a[7:0] >> count[7:0] +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 ELSE - dst[63:0] := src[63:0] + dst := 0 +FI +IF tmp[7:0] == 0xFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + Mask + + + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". -IF k[0] - dst[63:0] := (1.0 / b[63:0]) +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 ELSE - dst[63:0] := 0 + dst := 0 FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. + Mask + + + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". -IF k[0] - dst[63:0] := (1.0 / b[63:0]) +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0xFF + dst := 1 ELSE - dst[63:0] := 0 + dst := 0 FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst". The maximum relative error for this approximation is less than 2^-28, and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + Mask + + + + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". -dst[31:0] := (1.0 / b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +tmp1[7:0] := a[7:0] AND b[7:0] +IF tmp1[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[7:0] := (NOT a[7:0]) AND b[7:0] +IF tmp2[7:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. + Mask + + + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". -dst[31:0] := (1.0 / b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +tmp[7:0] := a[7:0] AND b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + Mask + + + + + + Compute the bitwise NOT of 8-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". -IF k[0] - dst[31:0] := (1.0 / b[31:0]) +tmp[7:0] := (NOT a[7:0]) AND b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 ELSE - dst[31:0] := src[31:0] + dst := 0 FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. + Mask + + + + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". -IF k[0] - dst[31:0] := (1.0 / b[31:0]) +tmp1[15:0] := a[15:0] AND b[15:0] +IF tmp1[15:0] == 0x0 + dst := 1 ELSE - dst[31:0] := src[31:0] + dst := 0 +FI +tmp2[15:0] := (NOT a[15:0]) AND b[15:0] +IF tmp2[15:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + Mask + + + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". -IF k[0] - dst[31:0] := (1.0 / b[31:0]) +tmp[15:0] := a[15:0] AND b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 ELSE - dst[31:0] := 0 + dst := 0 FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. + Mask + + + + + + Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". -IF k[0] - dst[31:0] := (1.0 / b[31:0]) +tmp[15:0] := (NOT a[15:0]) AND b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 ELSE - dst[31:0] := 0 + dst := 0 FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Convert 8-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend32(a[7:0]) + + + AVX512DQ
immintrin.h
-
- - Floating Point - AVX512ER - Elementary Math Functions - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + Mask + + + + + Convert integer value "a" into an 8-bit mask, and store the result in "k". + +k := a[7:0] + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Load 8-bit mask from memory into "k". + +k[7:0] := MEM[mem_addr+7:mem_addr] + + + AVX512DQ +
immintrin.h
+ Load +
+ + + + + Store 8-bit mask from "a" into memory. + +MEM[mem_addr+7:mem_addr] := a[7:0] + + + AVX512DQ +
immintrin.h
+ Store +
+ + + + + + + Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [sae_note] FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. +
+ + + + Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] +
+ + + + + + + Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. +
+ + + + + + Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] +
+ + + + + + Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. +
+ + + + + Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] +
+ + + + + Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [sae_note] FOR j := 0 to 7 i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := POW(2.0, a[i+63:i]) ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. +
+ + + + Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. FOR j := 0 to 7 i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := POW(2.0, a[i+63:i]) ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] +
+ + + + + + + Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := POW(2.0, a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. +
+ + + + + + Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := POW(2.0, a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] +
+ + + + + + Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := POW(2.0, a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. +
+ + + + + Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := POW(2.0, a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] +
+ + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] -dst[63:0] := (1.0 / SQRT(b[63:0])) +dst[63:0] := (1.0 / b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. +
+ + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. -dst[63:0] := (1.0 / SQRT(b[63:0])) +dst[63:0] := (1.0 / b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0 - + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst". The maximum relative error for this approximation is less than 2^-28, and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +dst[31:0] := (1.0 / b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. + +dst[31:0] := (1.0 / b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR + + + AVX512ER +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] + +dst[63:0] := (1.0 / SQRT(b[63:0])) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512ER
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. + +dst[63:0] := (1.0 / SQRT(b[63:0])) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - - +
+ + + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] IF k[0] @@ -41525,18 +39440,17 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - +
+ + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. IF k[0] @@ -41547,18 +39461,17 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - +
+ + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] IF k[0] @@ -41569,17 +39482,16 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - +
+ + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. IF k[0] @@ -41590,52 +39502,49 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - +
+ + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] dst[31:0] := (1.0 / SQRT(b[31:0])) dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - +
+ + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. dst[31:0] := (1.0 / SQRT(b[31:0])) dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - - +
+ + + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] IF k[0] @@ -41646,18 +39555,17 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - +
+ + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. IF k[0] @@ -41668,18 +39576,17 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - +
+ + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] IF k[0] @@ -41690,17 +39597,16 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - +
+ + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. IF k[0] @@ -41711,16 +39617,15 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] FOR j := 0 to 15 @@ -41728,15 +39633,14 @@ FOR j := 0 to 15 dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - +
+ + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28. FOR j := 0 to 15 @@ -41744,18 +39648,17 @@ FOR j := 0 to 15 dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] FOR j := 0 to 15 @@ -41767,17 +39670,16 @@ FOR j := 0 to 15 FI ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. FOR j := 0 to 15 @@ -41789,17 +39691,16 @@ FOR j := 0 to 15 FI ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] FOR j := 0 to 15 @@ -41811,16 +39712,15 @@ FOR j := 0 to 15 FI ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. FOR j := 0 to 15 @@ -41832,16 +39732,15 @@ FOR j := 0 to 15 FI ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - +
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note] FOR j := 0 to 7 @@ -41849,15 +39748,14 @@ FOR j := 0 to 7 dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - +
+ + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28. FOR j := 0 to 7 @@ -41865,18 +39763,17 @@ FOR j := 0 to 7 dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - - +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] FOR j := 0 to 7 @@ -41888,17 +39785,16 @@ FOR j := 0 to 7 FI ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. FOR j := 0 to 7 @@ -41910,17 +39806,16 @@ FOR j := 0 to 7 FI ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - - +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note] FOR j := 0 to 7 @@ -41932,16 +39827,15 @@ FOR j := 0 to 7 FI ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512ER +
immintrin.h
Elementary Math Functions - - - +
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. FOR j := 0 to 7 @@ -41953,5480 +39847,4785 @@ FOR j := 0 to 7 FI ENDFOR - + + AVX512ER
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Arithmetic - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+63:i] := ACOS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Arithmetic - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+31:i] := ACOS(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point AVX512F - AVX512VL - Arithmetic - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := ACOSH(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point AVX512F - AVX512VL - Arithmetic - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+63:i] := ACOSH(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Arithmetic - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := ACOSH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Arithmetic - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+63:i] := ASIN(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ASIN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Arithmetic - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := ASIN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Arithmetic - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+63:i] := ASINH(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (32*imm8[2:0]) -dst[255:0] := temp[255:0] -dst[MAX:256] := 0 +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (32*imm8[2:0]) -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := temp[i+31:i] + dst[i+31:i] := ASINH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Integer + Trigonometry + + + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Miscellaneous - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (32*imm8[2:0]) FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := temp[i+31:i] + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst". - -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (32*imm8[1:0]) -dst[127:0] := temp[127:0] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512F - AVX512VL - Miscellaneous - - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Trigonometry + + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (32*imm8[1:0]) -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := temp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (32*imm8[1:0]) -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := temp[i+31:i] + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (64*imm8[1:0]) -dst[255:0] := temp[255:0] -dst[MAX:256] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (64*imm8[1:0]) -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := temp[i+63:i] + dst[i+63:i] := ATAN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Integer + Trigonometry + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Miscellaneous - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (64*imm8[1:0]) -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := temp[i+63:i] + dst[i+31:i] := ATAN(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (64*imm8[0]) -dst[127:0] := temp[127:0] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer AVX512F - AVX512VL - Miscellaneous - - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (64*imm8[0]) -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := temp[i+63:i] + dst[i+63:i] := ATANH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Integer + Trigonometry + + + + + Compute the inverse hyperblic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - AVX512VL - Miscellaneous - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (64*imm8[0]) -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := temp[i+63:i] + dst[i+31:i] := ATANH(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI + dst[i+63:i] := COS(a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := b[i+63:i] + dst[i+63:i] := COS(a[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Trigonometry + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Miscellaneous - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := b[i+31:i] + dst[i+31:i] := COS(a[i+31:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Trigonometry + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Miscellaneous - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := b[i+31:i] + dst[i+63:i] := COSD(a[i+63:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 7 +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := COSD(a[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 i := j*32 - n := (j % 4)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := COSD(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Trigonometry + + + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COSH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Miscellaneous - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 + i := j*64 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+63:i] := COSH(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - AVX512VL AVX512F - Miscellaneous - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := COSH(a[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - AVX512VL AVX512F - Miscellaneous - - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - n := (j % 4)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := COSH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - AVX512VL AVX512F - Miscellaneous - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[63:0] + dst[i+63:i] := SIN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := SIN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Trigonometry + + + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Miscellaneous - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+63:i] := SINH(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := SINH(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := SINH(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 i := j*64 - k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 + dst[i+63:i] := SIND(a[i+63:i]) ENDFOR -k[MAX:4] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := SIND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) ENDFOR -k[MAX:2] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SIND(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:2] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + FOR j := 0 to 7 - i := j*32 - k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 + i := j*64 + dst[i+63:i] := TAN(a[i+63:i]) ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*64 + IF k[j] + dst[i+63:i] := TAN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + dst[i+31:i] := TAN(a[i+31:i]) ENDFOR -k[MAX:4] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Mask - AVX512VL AVX512F - Compare - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := TAN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Trigonometry + + + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Miscellaneous - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 3 +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := TAND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Trigonometry + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Store - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 3 - i := j*64 +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+31:i] := TAND(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -size := 64 -m := 0 -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI + dst[i+63:i] := TANH(a[i+63:i]) ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 64 -m := 0 -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := TANH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Store - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Miscellaneous - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + Trigonometry + + + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -size := 64 -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TANH(a[i+31:i]) ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 32 -m := 0 -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+31:i] := TANH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Store - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". -size := 32 -m := base_addr FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
immintrin.h
+ Trigonometry +
+ + + + + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). -size := 32 -m := 0 FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) + ELSE + dst[i+63:i] := sin_src[i+63:i] + MEM[mem_addr+i+63:mem_addr+i] := cos_src[i+63:i] FI ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". -size := 32 -m := 0 -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Store - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Trigonometry +
+ + + + + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). -size := 32 -m := base_addr -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) + ELSE + dst[i+31:i] := sin_src[i+31:i] + MEM[mem_addr+i+31:mem_addr+i] := cos_src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CubeRoot(a[i+63:i]) ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - m := j*64 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + dst[i+63:i] := CubeRoot(a[i+63:i]) ELSE - dst[m+63:m] := src[m+63:m] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Elementary Math Functions + + + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 i := j*32 - m := j*64 IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + dst[i+31:i] := CubeRoot(a[i+31:i]) ELSE - dst[m+63:m] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Elementary Math Functions + + + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*32 - m := j*64 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + dst[i+63:i] := POW(10.0, a[i+63:i]) ELSE - dst[m+63:m] := src[m+63:m] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - m := j*64 - IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) - ELSE - dst[m+63:m] := 0 - FI + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+63:i] := POW(2.0, a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - l := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Elementary Math Functions + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 32*j - l := 64*j +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + dst[i+63:i] := POW(e, a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Elementary Math Functions + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - l := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + dst[i+31:i] := POW(FP32(e), a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 32*j - l := 64*j +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 32*j - l := 64*j +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Elementary Math Functions + + + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*32 - l := j*64 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - l := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Elementary Math Functions + + + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + dst[i+63:i] := InvSQRT(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 i := j*32 - l := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + dst[i+31:i] := InvSQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - m := j*16 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - m := j*16 IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + dst[i+63:i] := LOG(1.0 + a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + dst[i+31:i] := LOG(1.0 + a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 16*j - l := 32*j + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 7 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := src[i+15:i] - FI + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 16*j - l := 32*j + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + dst[i+63:i] := LOG(a[i+63:i]) ELSE - dst[i+15:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 7 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := 0 - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 16*j - l := 32*j +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + dst[i+31:i] := LOG(a[i+31:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Elementary Math Functions + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 3 - i := 16*j - l := 32*j +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Elementary Math Functions + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 3 - i := 16*j - l := 32*j +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE - dst[i+15:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Elementary Math Functions + + + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 16*j - l := 32*j +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) ELSE - dst[i+15:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Elementary Math Functions + + + + + Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + dst[i+63:i] := (1.0 / a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst". -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 32*j +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + dst[i+31:i] := (1.0 / a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Elementary Math Functions + + + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + dst[i+63:i] := CDFNormal(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + dst[i+31:i] := CDFNormal(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + dst[i+63:i] := ERF(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + dst[i+31:i] := ERF(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+63:i] := 1.0 - ERF(a[i+31:i]) ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + dst[i+63:i] := 1.0 - ERF(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+63:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + dst[i+63:i] := 1.0 / ERF(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Probability/Statistics + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 32*j +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + dst[i+63:i] := CEIL(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + dst[i+31:i] := CEIL(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Special Math Functions + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + dst[i+63:i] := FLOOR(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 32*j +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + dst[i+31:i] := FLOOR(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL - AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 + Special Math Functions + + + + + Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst". + FOR j := 0 to 7 i := j*64 - l := j*32 - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + dst[i+63:i] := NearbyInt(a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + dst[i+63:i] := NearbyInt(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512VL + Special Math Functions + + + + + Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := NearbyInt(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + dst[i+31:i] := NearbyInt(a[i+31:i]) ELSE - dst[i+63:i] := 0 - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 +
immintrin.h
+ Special Math Functions +
+ + + + Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + FOR j := 0 to 7 i := j*64 - l := j*32 - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + dst[i+63:i] := RoundToNearestEven(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 - l := j*32 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + dst[i+63:i] := RoundToNearestEven(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer - AVX512VL AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI +
immintrin.h
+ Special Math Functions +
+ + + + Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundToNearestEven(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Arithmetic - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] + dst[i+31:i] := RoundToNearestEven(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Special Math Functions + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Arithmetic - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 3 - i := 64*j +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] + dst[i+63:i] := ROUND(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Special Math Functions + + + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Arithmetic - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j +
immintrin.h
+ Special Math Functions +
+ + + + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] + dst[i+63:i] := TRUNCATE(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Special Math Functions + + + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Arithmetic - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j +
immintrin.h
+ Special Math Functions +
+ + + + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] + dst[i+31:i] := TRUNCATE(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Special Math Functions + + + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F +
immintrin.h
Arithmetic - - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := 32*j IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Arithmetic + + + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 63 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F +
immintrin.h
Arithmetic - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := 0 + i := 64*j + IF b[i+63:i] == 0 + #DE FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Arithmetic + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F +
immintrin.h
Arithmetic - - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 +
+ + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 i := 32*j IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Arithmetic + + + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 63 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F +
immintrin.h
Arithmetic - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 31 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -FOR j := 0 to 3 +FOR j := 0 to 15 i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := 0 + IF b[i+31:i] == 0 + #DE FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Miscellaneous - - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 15 + i := 32*j IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL AVX512F - Load - - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] +FOR j := 0 to 63 + i := 8*j + IF b[i+7:i] == 0 + #DE FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Arithmetic + + + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Miscellaneous - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". -m := 0 -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := 32*j IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + AVX512F
immintrin.h
-
- - Floating Point - AVX512VL + Arithmetic + + + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 63 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + AVX512F - Load - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 31 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -m := 0 -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 3 - i := j*32 + i := 64*j IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Load - - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 64*j IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 + dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Miscellaneous - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 3 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 3 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 + dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F - Miscellaneous - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F - Miscellaneous - - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - + + + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F - Miscellaneous - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + + + AVX512F - Miscellaneous - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + + + AVX512F - Miscellaneous - - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + + + AVX512F - Miscellaneous - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F - Miscellaneous - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN: j := 0 - SNAN_TOKEN: j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + FOR j := 0 to 3 i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F - Miscellaneous - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0 - + + + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Miscellaneous - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F - Miscellaneous - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F - Miscellaneous - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F - Miscellaneous - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0 - + + + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Miscellaneous - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F - Miscellaneous - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] @@ -47434,30 +44633,29 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] @@ -47465,30 +44663,29 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 @@ -47496,30 +44693,29 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] @@ -47527,30 +44723,29 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 1 i := j*64 IF k[j] IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] @@ -47558,30 +44753,29 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 @@ -47589,30 +44783,29 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] @@ -47620,30 +44813,29 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] @@ -47651,30 +44843,29 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 @@ -47682,30 +44873,29 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] @@ -47713,30 +44903,29 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] @@ -47744,30 +44933,29 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 @@ -47775,2283 +44963,1929 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 1 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + + + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := c[i+31:i] + dst[i+31:i] := src[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI -ENDFOR -dst[MAX:256] := 0 +ENDFOR +dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := c[i+31:i] + dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := src[i+63:i] FI -ENDFOR -dst[MAX:128] := 0 +ENDFOR +dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI -ENDFOR -dst[MAX:128] := 0 +ENDFOR +dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. FOR j := 0 to 7 i := j*32 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 - m := j*64 + i := j*32 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - m := j*64 +FOR j := 0 to 7 + i := j*32 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+31:i] := ABS(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*32 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+31:i] := ABS(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 3 i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := ABS(a[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := ABS(a[i+63:i]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] :=0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 1 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 1 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*32 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Miscellaneous - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR dst[MAX:256] := 0 - + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Miscellaneous - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512F - Miscellaneous - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - -
immintrin.h
-
- AVX512VL - AVX512F - Miscellaneous - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512F - Miscellaneous - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 @@ -50063,19 +46897,18 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 @@ -50087,20 +46920,37 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 @@ -50112,19 +46962,18 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 @@ -50136,118 +46985,131 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 @@ -50259,19 +47121,18 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 @@ -50283,20 +47144,37 @@ FOR j := 0 to 3 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 @@ -50308,19 +47186,18 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 @@ -50332,20 +47209,37 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 @@ -50357,19 +47251,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 @@ -50381,20 +47274,19 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 @@ -50406,19 +47298,18 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Arithmetic - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 @@ -50430,24382 +47321,22616 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 3 i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*32 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := j*64 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+63:i] := (1.0 / a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - - Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := (1.0 / a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Store - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := j*64 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer + Arithmetic + + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Load - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := (1.0 / a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := (1.0 / a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Load - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := (1.0 / a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - - Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := (1.0 / a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 3 i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI + dst[i+31:i] := (1.0 / a[i+31:i]) ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 3 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 3 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - - Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := j*32 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - - Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Move - - - - Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Load - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst". + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +dst[255:0] := temp[255:0] +dst[MAX:256] := 0 + + AVX512F - Load - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +FOR j := 0 to 7 i := j*32 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst". + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) +dst[127:0] := temp[127:0] +dst[MAX:128] := 0 + + AVX512F - Load - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := temp[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+31:i] := temp[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst". + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) +dst[255:0] := temp[255:0] +dst[MAX:256] := 0 + + AVX512F - Store - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) FOR j := 0 to 3 i := j*64 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst". + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) +dst[127:0] := temp[127:0] +dst[MAX:128] := 0 + + AVX512F - Load - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) FOR j := 0 to 1 i := j*64 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := b[i+63:i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+63:i] := b[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := b[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := b[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] FOR j := 0 to 7 i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] FOR j := 0 to 7 i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Move - - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI -ENDFOR -dst[MAX:128] := 0 +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Move - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+31:i] := a[31:0] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - - Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+31:i] := a[31:0] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := j*32 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Load - - - - Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Load - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Load - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. + Miscellaneous + + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := a[31:0] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +size := 64 +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR +dst[255:m] := src[255:m] dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +size := 64 +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := 0 + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR +dst[255:m] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +size := 64 +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR +dst[127:m] := src[127:m] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +size := 64 +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := 0 + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR +dst[127:m] := 0 dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +size := 32 +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := 0 + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[255:m] := src[255:m] dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Arithmetic - - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +size := 32 +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[255:m] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Arithmetic - - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +size := 32 +m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[127:m] := src[127:m] dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +size := 32 +m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := 0 + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[127:m] := 0 dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ABS(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Arithmetic - - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) + dst[i+63:i] := a[m+63:m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) + dst[i+63:i] := a[m+63:m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ABS(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Arithmetic - - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) + dst[i+63:i] := a[m+63:m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) + dst[i+63:i] := a[m+63:m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := a[m+31:m] + m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := a[m+31:m] + m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := a[m+31:m] + m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := a[m+31:m] + m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + AVX512F - Arithmetic - - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+63:i] :=0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Logical - - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Logical - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + Miscellaneous + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN: j := 0 + SNAN_TOKEN: j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Logical - - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Logical - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Logical - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Logical - - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + Miscellaneous + + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Logical - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Logical - - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Logical - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + Miscellaneous + + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Miscellaneous - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Miscellaneous - - - - - Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F Miscellaneous - - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Set - - - - - Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Miscellaneous - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Set - - - - Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + Miscellaneous + + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Set - - - - - Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F Miscellaneous - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 3 i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Set - - - - Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Set - - - - - Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := a[63:0] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[63:0] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Set - - - - Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[63:0] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Set - - - - - Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[63:0] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[63:0] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Set - - - - Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := a[63:0] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Compare - - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI ENDFOR -k[MAX:4] := 0 +dst[255:m] := src[255:m] +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI ENDFOR -k[MAX:4] := 0 +dst[255:m] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -k[MAX:4] := 0 +dst[127:m] := src[127:m] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +size := 32 +m := 0 FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -k[MAX:4] := 0 +dst[127:m] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +size := 64 +m := 0 FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -k[MAX:4] := 0 +dst[255:m] := src[255:m] +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +size := 64 +m := 0 FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -k[MAX:4] := 0 +dst[255:m] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". -FOR j := 0 to 3 +size := 64 +m := 0 +FOR j := 0 to 1 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -k[MAX:4] := 0 +dst[127:m] := src[127:m] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. -FOR j := 0 to 3 +size := 64 +m := 0 +FOR j := 0 to 1 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -k[MAX:4] := 0 +dst[127:m] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI ENDFOR -k[MAX:2] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer - Mask + Miscellaneous + + + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F - Compare - - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set) FOR j := 0 to 1 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] FI ENDFOR -k[MAX:2] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR -k[MAX:2] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:2] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". FOR j := 0 to 1 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + off := idx[i]*64 + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ENDFOR -k[MAX:2] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + Miscellaneous + + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + Miscellaneous + + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". FOR j := 0 to 7 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + off := idx[i+2:i]*32 + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". FOR j := 0 to 3 i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + off := idx[i+1:i]*32 + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + i := j*64 + off := idx[i+1:i]*64 + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer - Mask + Miscellaneous + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + Miscellaneous + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + id := idx[i+1:i]*64 + dst[i+63:i] := a[id+63:id] ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - -
immintrin.h
-
- - Integer - Mask AVX512VL - AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + id := idx[i+1:i]*64 + dst[i+63:i] := a[id+63:id] ENDFOR -k[MAX:2] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[255:m] := src[255:m] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 FI ENDFOR -dst[255:m] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[127:m] := src[127:m] dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 FI ENDFOR -dst[127:m] := 0 dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[255:m] := src[255:m] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 3 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 FI ENDFOR -dst[255:m] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[127:m] := src[127:m] dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 FI ENDFOR -dst[127:m] := 0 dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 - id := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). +
+ + + + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 i := j*32 - off := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := idx[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 i := j*32 - off := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 - off := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 i := j*32 - off := idx[i+1:i]*32 IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := idx[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 - off := idx[i+1:i]*32 IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 - off := idx[i+1:i]*32 IF k[j] - dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 - off := idx[i+1:i]*64 IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 - off := idx[i+1:i]*64 IF k[j] - dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set) - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 - off := idx[i]*64 IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 - off := idx[i]*64 IF k[j] - dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := idx[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 - off := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 - off := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 - off := idx[i+1:i]*32 IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := idx[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 - off := idx[i+1:i]*32 IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 + i := j*64 IF k[j] - dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 - off := idx[i+1:i]*64 IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := idx[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 i := j*64 - off := idx[i+1:i]*64 IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 i := j*64 - off := idx[i+1:i]*64 IF k[j] - dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:256] := 0 - - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Miscellaneous - - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Miscellaneous - - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Miscellaneous - - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := 0 +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] FI -ENDFOR -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Miscellaneous - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI + RETURN tmp[63:0] +} FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 3 i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI RETURN tmp[31:0] } -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI RETURN tmp[31:0] } -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI RETURN tmp[31:0] } -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI RETURN tmp[31:0] } -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI RETURN tmp[31:0] } -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI RETURN tmp[31:0] } -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] } -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) FOR j := 0 to 3 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] } -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := src[i+63:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] } -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 i := j*64 - id := idx[i+1:i]*64 IF k[j] - dst[i+63:i] := a[id+63:id] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 3 +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 i := j*64 - id := idx[i+1:i]*64 - dst[i+63:i] := a[id+63:id] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 7 i := j*32 - id := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 7 i := j*32 - id := idx[i+2:i]*32 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". - +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 7 i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 64-bit integers in "a" across lanes lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] } -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 + i := j*32 IF k[j] - dst[i+63:i] := a[id+63:id] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] } -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 - i := j*64 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := a[id+63:id] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Miscellaneous - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + + + + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 3 i := j*64 - id := idx[i+1:i]*64 - dst[i+63:i] := a[id+63:id] + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 7 - i := j*32 +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Load - - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F Miscellaneous - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Miscellaneous - - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F +
immintrin.h
Miscellaneous - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 1 - i := j*64 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) FOR j := 0 to 3 i := j*32 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) FOR j := 0 to 3 - i := j*64 - m := j*32 + i := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 i := j*64 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 - i := j*32 - m := j*64 + i := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Load - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 i := j*64 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Arithmetic - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Arithmetic - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 3 i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 3 i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Arithmetic - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Arithmetic - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 3 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 3 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + AVX512F - Arithmetic - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + AVX512F - Arithmetic - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Convert - Store - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) - FI +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+31:i]) + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+31:i]) +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + AVX512F - Convert - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+31:i]) + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) - FI + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+63:i]) + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+63:i]) +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[k+31:k] := Truncate32(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) - FI + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[k+31:k] := Truncate32(a[i+63:i]) + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) - FI + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 3 - i := 64*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+63:i]) +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+63:i]) + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:32] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+31:i]) + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) - FI + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+31:i]) +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:32] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:32] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+31:i]) + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:8] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+31:i]) + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) - FI + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+63:i]) + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+63:i]) +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:16] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:16] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:16] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[k+31:k] := Saturate32(a[i+63:i]) + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) - FI + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[k+31:k] := Saturate32(a[i+63:i]) +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) - FI +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+63:i]) + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+63:i]) +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) - FI + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:32] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 7 i := j*32 - l := j*16 - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE - dst[i+31:i] := src[i+31:i] + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 3 i := j*32 - l := j*16 - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE - dst[i+31:i] := src[i+31:i] + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE - dst[i+63:i] := src[i+63:i] + k[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:256] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE - dst[i+63:i] := src[i+63:i] + k[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE - dst[l+7:l] := src[l+7:l] + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) - FI + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE - dst[l+7:l] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:64] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+31:i]) + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE - dst[l+7:l] := src[l+7:l] + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) - FI + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR +k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE - dst[l+7:l] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:32] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. -FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+31:i]) +FOR j := 0 to 1 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:2] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := 32*j - l := 16*j +size := 64 +m := base_addr +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] + MEM[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := 32*j - l := 16*j +size := 64 +m := base_addr +FOR j := 0 to 1 + i := j*64 IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + MEM[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +size := 32 +m := base_addr FOR j := 0 to 7 - i := 32*j - l := 16*j + i := j*32 IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 + MEM[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +size := 32 +m := base_addr FOR j := 0 to 3 - i := 32*j - l := 16*j + i := j*32 IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] + MEM[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. FOR j := 0 to 3 - i := 32*j - l := 16*j + i := j*64 IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 3 - i := 32*j - l := 16*j +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+63:i]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI ENDFOR -dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. FOR j := 0 to 3 - i := 64*j - l := 8*j + i := j*32 IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 3 - i := 64*j - l := 8*j +FOR j := 0 to 7 + i := j*32 IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. FOR j := 0 to 3 - i := 64*j - l := 8*j + i := j*32 IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+63:i]) -ENDFOR -dst[MAX:16] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 1 - i := 64*j - l := 8*j +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. FOR j := 0 to 1 - i := 64*j - l := 8*j + i := j*64 IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:16] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[k+31:k] := SaturateU32(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Store + + + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := 64*j - l := 32*j +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 3 - i := 64*j - l := 32*j + i := j*32 IF k[j] - MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 3 - i := 64*j - l := 32*j + i := j*64 IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[k+31:k] := SaturateU32(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 1 - i := 64*j - l := 32*j + i := j*64 IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 1 - i := 64*j - l := 32*j +FOR j := 0 to 3 + i := j*64 IF k[j] - MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 1 - i := 64*j - l := 32*j + i := j*64 IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := 64*j - l := 16*j +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 3 - i := 64*j - l := 16*j + i := j*32 IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 3 - i := 64*j - l := 16*j +size := 32 +m := base_addr +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 + MEM[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+63:i]) -ENDFOR -dst[MAX:32] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 1 - i := 64*j - l := 16*j +size := 32 +m := base_addr +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] + MEM[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert + AVX512VL +
immintrin.h
Store +
+ - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 1 - i := 64*j - l := 16*j +size := 64 +m := base_addr +FOR j := 0 to 3 + i := j*64 IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + MEM[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +size := 64 +m := base_addr FOR j := 0 to 1 - i := 64*j - l := 16*j + i := j*64 IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 + MEM[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 - i := 32*j - l := 8*j + i := j*32 + m := j*32 IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 8-bit integers in th elow 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 32*j - l := 8*j + i := j*32 + m := j*32 IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 64*j - l := 8*j + i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 - i := 64*j - l := 8*j + i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 64*j - l := 32*j + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 - i := 64*j - l := 32*j + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 - i := 32*j - l := 16*j +FOR j := 0 to 3 + i := j*64 + m := j*64 IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 3 - i := 32*j - l := 16*j +FOR j := 0 to 1 + i := j*64 + m := j*64 IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - - Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Convert - - - - Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := 64*j - l := 16*j + i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + AVX512F - Convert - - - - - Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 - i := 64*j - l := 16*j + i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + AVX512F - Convert - - - - Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 1 - i := 64*j - l := 16*j +FOR j := 0 to 7 + i := j*32 + m := j*32 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + AVX512F - Arithmetic - - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := j*64 + i := j*32 + m := j*32 IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := src[i+63:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + AVX512F - Arithmetic - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 i := j*64 + m := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 i := j*64 + m := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 + m := j*64 IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer + Store + + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + AVX512F - Arithmetic - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 +FOR j := 0 to 1 i := j*32 + m := j*64 IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:512] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer + Store + + + + + + Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX512F - Arithmetic - - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 +MEM[mem_addr+127:mem_addr] := a[127:0] - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer + Store + + + + + + Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + AVX512F - Arithmetic - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 +MEM[mem_addr+255:mem_addr] := a[255:0] - + + AVX512F + AVX512VL
immintrin.h
-
- - Integer + Store + + + + + + Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + AVX512F - Arithmetic - - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[m+63:m] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[m+63:m] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := j*32 + l := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := j*32 + l := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 + l := j*64 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*32 + l := j*64 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} FOR j := 0 to 3 - i := j*64 + i := j*32 + l := j*64 IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} FOR j := 0 to 1 - i := j*64 + i := j*32 + l := j*64 IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} FOR j := 0 to 1 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} FOR j := 0 to 7 i := j*32 + m := j*16 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} FOR j := 0 to 7 i := j*32 + m := j*16 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} FOR j := 0 to 3 i := j*32 + m := j*16 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} FOR j := 0 to 3 i := j*32 + m := j*16 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 16*j + l := 32*j IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 16*j + l := 32*j IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+63:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} FOR j := 0 to 7 - i := j*32 + i := 16*j + l := 32*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 16*j + l := 32*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+31:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} FOR j := 0 to 3 - i := j*32 + i := 16*j + l := 32*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} FOR j := 0 to 3 - i := j*32 + i := 16*j + l := 32*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+31:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := 32*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := 32*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} FOR j := 0 to 1 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} FOR j := 0 to 1 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Store - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - m := j*32 + i := 32*j IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 - m := j*32 + i := 32*j IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - m := j*32 +FOR j := 0 to 7 + i := 32*j IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Store - - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - m := j*32 +FOR j := 0 to 7 + i := 32*j IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 3 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 - m := j*64 + i := 32*j IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Store - - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*32 - m := j*64 +FOR j := 0 to 3 + i := 32*j IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 3 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + l := j*32 + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - m := j*64 + l := j*32 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 1 i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI + l := j*32 + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := j*64 + l := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[l+7:l] := Truncate8(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 8*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 8*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI + dst[l+7:l] := Truncate8(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 32*j + l := 8*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI + dst[l+7:l] := Truncate8(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 32*j + l := 8*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 8*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI + dst[l+7:l] := Truncate8(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI + dst[l+15:l] := Truncate16(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI + dst[l+15:l] := Truncate16(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 16*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI + dst[l+15:l] := Truncate16(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 16*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 16*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI + dst[l+15:l] := Truncate16(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI + dst[l+7:l] := Truncate8(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := 64*j + l := 8*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := 64*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI + dst[l+7:l] := Truncate8(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI + dst[l+7:l] := Truncate8(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + dst[l+7:l] := Truncate8(a[i+63:i]) ELSE - dst[i+31:i] := 0 - FI + dst[l+7:l] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 32*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + dst[l+31:l] := Truncate32(a[i+63:i]) ELSE - dst[i+31:i] := 0 - FI + dst[l+31:l] := src[l+31:l] + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 32*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 32*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[l+31:l] := Truncate32(a[i+63:i]) ELSE - dst[i+63:i] := 0 - FI + dst[l+31:l] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 32*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[l+31:l] := Truncate32(a[i+63:i]) ELSE - dst[i+63:i] := 0 - FI + dst[l+31:l] := src[l+31:l] + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 32*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[l+31:l] := Truncate32(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 64*j + l := 16*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI + dst[l+15:l] := Truncate16(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 64*j + l := 16*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 16*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI + dst[l+15:l] := Truncate16(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 16*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[l+15:l] := Truncate16(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 16*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 16*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[l+15:l] := Truncate16(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 8*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI + dst[l+7:l] := Saturate8(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 8*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI + dst[l+7:l] := Saturate8(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI + dst[l+7:l] := Saturate8(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI + dst[l+15:l] := Saturate16(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI + dst[l+15:l] := Saturate16(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := 32*j + l := 16*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI + dst[l+15:l] := Saturate16(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 1 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI + dst[l+7:l] := Saturate8(a[i+63:i]) ELSE - dst[i+31:i] := 0 - FI + dst[l+7:l] := src[l+7:l] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI + dst[l+7:l] := Saturate8(a[i+63:i]) ELSE - dst[i+31:i] := 0 - FI + dst[l+7:l] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 1 + i := 64*j + l := 8*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI + dst[l+7:l] := Saturate8(a[i+63:i]) ELSE - dst[i+63:i] := 0 - FI + dst[l+7:l] := src[l+7:l] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 3 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 8*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI + dst[l+7:l] := Saturate8(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[l+7:l] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:16] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := 0 - FI +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 64*j + l := 32*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 64*j + l := 32*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[l+31:l] := Saturate32(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 32*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI + dst[l+31:l] := Saturate32(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 32*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 32*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI + dst[l+31:l] := Saturate32(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 16*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[l+15:l] := Saturate16(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 16*j IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 16*j IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[l+15:l] := Saturate16(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 1 + i := 64*j + l := 16*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI + dst[l+15:l] := Saturate16(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 1 + i := 64*j + l := 16*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 1 + i := 64*j + l := 16*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI + dst[l+15:l] := Saturate16(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI + dst[i+31:i] := SignExtend32(a[l+7:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI + dst[i+31:i] := SignExtend32(a[l+7:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := 32*j + l := 8*j IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI + dst[i+31:i] := SignExtend32(a[l+7:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 3 + i := 32*j + l := 8*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI + dst[i+31:i] := SignExtend32(a[l+7:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+7:l]) ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 3 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+7:l]) ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+7:l]) ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 8*j IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+7:l]) ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 32*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 32*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+31:l]) ELSE dst[i+63:i] := 0 - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 32*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 32*j IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := SignExtend64(a[l+31:l]) ELSE dst[i+63:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 + l := j*16 IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := SignExtend32(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 16*j IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := SignExtend32(a[l+15:l]) ELSE dst[i+31:i] := 0 - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 + l := j*16 IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := SignExtend32(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 16*j IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := SignExtend32(a[l+15:l]) ELSE dst[i+31:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := SignExtend64(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := SignExtend64(a[l+15:l]) ELSE dst[i+63:i] := 0 - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := SignExtend64(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := SignExtend64(a[l+15:l]) ELSE dst[i+63:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 8*j IF k[j] - FOR h := 0 to 31 - index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[l+7:l] := SaturateU8(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 8*j IF k[j] - FOR h := 0 to 31 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - FOR h := 0 to 31 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 8*j IF k[j] - FOR h := 0 to 31 - index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[l+7:l] := SaturateU8(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 8*j IF k[j] - FOR h := 0 to 31 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". - -FOR j := 0 to 3 - i := j*32 - FOR h := 0 to 31 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 32*j + l := 8*j IF k[j] - FOR h := 0 to 63 - index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[l+7:l] := SaturateU8(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - FOR h := 0 to 63 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[l+15:l] := SaturateU16(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". - -FOR j := 0 to 3 - i := j*64 - FOR h := 0 to 63 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - FOR h := 0 to 63 - index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) FI ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 7 + i := 32*j + l := 16*j IF k[j] - FOR h := 0 to 63 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[l+15:l] := SaturateU16(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - FOR h := 0 to 63 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) ELSE - k[j] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) ELSE - k[j] := 0 + dst[l+15:l] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". FOR j := 0 to 3 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) ENDFOR -k[MAX:4] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) ELSE - k[j] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI ENDFOR -k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) ELSE - k[j] := 0 + dst[l+7:l] := 0 FI ENDFOR -k[MAX:2] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". FOR j := 0 to 1 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) ENDFOR -k[MAX:2] := 0 +dst[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) ELSE - k[j] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) ELSE - k[j] := 0 + dst[l+7:l] := 0 FI ENDFOR -k[MAX:4] := 0 +dst[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". FOR j := 0 to 3 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) ELSE - k[j] := 0 + dst[l+31:l] := src[l+31:l] FI ENDFOR -k[MAX:4] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 3 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI ENDFOR -k[MAX:4] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) ELSE - k[j] := 0 + dst[l+31:l] := 0 FI ENDFOR -k[MAX:2] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512VL + AVX512F - Compare - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". FOR j := 0 to 1 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) ENDFOR -k[MAX:2] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 32*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[l+31:l] := SaturateU32(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + AVX512F - Miscellaneous - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 1 + i := 64*j + l := 32*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[l+31:l] := SaturateU32(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 - i := j*32 + i := 64*j + l := 16*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[l+15:l] := SaturateU16(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) FI ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[l+15:l] := SaturateU16(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + AVX512F - Miscellaneous - - - - - - Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[l+15:l] := SaturateU16(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + AVX512F - Miscellaneous - - - - - Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[l+15:l] := SaturateU16(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL
immintrin.h
-
- - Integer + Convert + + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Logical - - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 16*j IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 + i := 32*j + l := 16*j IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 16*j IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*32 + i := 32*j + l := 16*j IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Logical - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Arithmetic - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Load +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Arithmetic - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Load +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Arithmetic - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Load +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} FOR j := 0 to 3 i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Miscellaneous - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + Load + + + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} FOR j := 0 to 1 i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Miscellaneous - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + Load + + + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} FOR j := 0 to 7 i := j*32 + m := j*32 IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 + m := j*32 IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := j*64 + m := j*64 IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} FOR j := 0 to 3 i := j*32 + m := j*64 IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI + dst[i+31:i] := src[i+31:i] FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 3 - i := j*64 +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Miscellaneous - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI + dst[i+31:i] := 0 FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + FOR j := 0 to 3 i := j*64 - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Store - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 3 i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 3 i := j*64 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Store - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 1 i := j*64 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Store - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*32 - m := j*32 +FOR j := 0 to 1 + i := j*64 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Store - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 3 - i := j*32 - m := j*32 + i := j*64 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Store - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 3 i := j*64 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Store - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 1 i := j*64 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Store - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Store - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Store - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - -
immintrin.h
-
- - Floating Point - AVX512VL - AVX512F - Store - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + Load + + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 1 - i := j*32 - m := j*64 + i := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - -
immintrin.h
-
- AVX512VL - AVX512F - Miscellaneous - - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] +m := 0 FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - -
immintrin.h
-
- AVX512VL - AVX512F - Miscellaneous - - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] +m := 0 FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] +m := 0 FOR j := 0 to 3 - i := j*64 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - AVX512VL + AVX512F - Miscellaneous - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Floating Point AVX512VL - AVX512F - Miscellaneous - - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +m := 0 FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - - Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +m := 0 FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 + m := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*32 + m := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) FOR j := 0 to 3 - i := j*32 + i := j*64 + m := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 1 + i := j*64 + m := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 3 - i := j*64 + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Elementary Math Functions - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 3 - i := j*64 +FOR j := 0 to 1 + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*64 + m := j*64 IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Elementary Math Functions - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 1 i := j*64 + m := j*64 IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Elementary Math Functions - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR +dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +dst[127:0] := MEM[mem_addr+127:mem_addr] dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Load + + + + + Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + AVX512F - Elementary Math Functions - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR +dst[127:0] := MEM[mem_addr+127:mem_addr] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + AVX512F AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + AVX512F - Arithmetic - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 - FI + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Arithmetic - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 - FI + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512VL + AVX512F - Miscellaneous - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - - - - Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - + AVX512VL
immintrin.h
-
- - Integer - AVX512F - Store - - - - Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. + Move + + + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -MEM[mem_addr+511:mem_addr] := a[511:0] +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Store - - - - Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Store - - - - Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. + Move + + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -MEM[mem_addr+127:mem_addr] := a[127:0] +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Store - - - - Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - -
immintrin.h
-
- - Integer - AVX512VL - AVX512F - Store - - - - Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + Move + + + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -MEM[mem_addr+255:mem_addr] := a[255:0] +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Store - - - - Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Store - - - - Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - -
immintrin.h
-
- - Integer - AVX512F - Load - - - Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. + Move + + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer AVX512VL - AVX512F - Load - - - Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. +
immintrin.h
+ Move +
+ + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[255:0] := MEM[mem_addr+255:mem_addr] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[255:0] := MEM[mem_addr+255:mem_addr] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[127:0] := MEM[mem_addr+127:mem_addr] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[127:0] := MEM[mem_addr+127:mem_addr] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[255:0] := MEM[mem_addr+255:mem_addr] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[255:0] := MEM[mem_addr+255:mem_addr] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[127:0] := MEM[mem_addr+127:mem_addr] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F - Load - - - Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[127:0] := MEM[mem_addr+127:mem_addr] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 3 i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 1 i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + AVX512F + AVX512VL +
immintrin.h
Logical - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - VAES - Cryptography - - - - Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst"." - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := ShiftRows(a[i+127:i]) - a[i+127:i] := SubBytes(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512F - VAES - Cryptography - - - - Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst"." - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := ShiftRows(a[i+127:i]) - a[i+127:i] := SubBytes(a[i+127:i]) - a[i+127:i] := MixColumns(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] + Logical + + + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - VAES - Cryptography - - - - Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := InvShiftRows(a[i+127:i]) - a[i+127:i] := InvSubBytes(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512F - VAES - Cryptography - - - - Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := InvShiftRows(a[i+127:i]) - a[i+127:i] := InvSubBytes(a[i+127:i]) - a[i+127:i] := InvMixColumns(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] + Logical + + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] AND b[15:0] -k[MAX:16] := 0 - - + AVX512VL
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + Logical + + + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -k[15:0] := (NOT a[15:0]) AND b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". - -k[15:0] := NOT a[15:0] -k[MAX:16] := 0 - - + AVX512VL
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + Logical + + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -k[15:0] := a[15:0] OR b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := NOT (a[15:0] XOR b[15:0]) -k[MAX:16] := 0 - - + AVX512VL
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + Logical + + + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -k[15:0] := a[15:0] XOR b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Shift the bits of 16-bit mask "a" left by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 15 - k[15:0] := a[15:0] << count[7:0] -FI - - + AVX512VL
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Shift the bits of 16-bit mask "a" right by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". + Logical + + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -k[MAX:0] := 0 -IF count[7:0] <= 15 - k[15:0] := a[15:0] >> count[7:0] -FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Load - - - Load 16-bit mask from memory into "k". - -k[15:0] := MEM[mem_addr+15:mem_addr] - - + AVX512VL
immintrin.h
-
- - Mask - AVX512F - Store - - - - Store 16-bit mask from "a" into memory. + Logical + + + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -MEM[mem_addr+15:mem_addr] := a[15:0] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - - Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". - -tmp[15:0] := a[15:0] OR b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -IF tmp[15:0] == 0xFFFF - MEM[all_ones+7:all_ones] := 1 -ELSE - MEM[all_ones+7:all_ones] := 0 -FI - - + AVX512VL
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + Logical + + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[15:0] := a[15:0] OR b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". - -tmp[15:0] := a[15:0] OR b[15:0] -IF tmp[15:0] == 0xFFFF - dst := 1 -ELSE - dst := 0 -FI - - + AVX512VL
immintrin.h
-
- - AVX512F - Mask - - - Convert 16-bit mask "a" into an integer value, and store the result in "dst". + Logical + + + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set). -dst := ZeroExtend32(a[15:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR h := 0 to 31 + index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- + AVX512F - Mask - - - Convert integer value "a" into an 16-bit mask, and store the result in "k". - -k := ZeroExtend16(a[15:0]) - - + AVX512VL
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + Logical + + + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). -k[15:0] := (NOT a[15:0]) AND b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR h := 0 to 31 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". -k[15:0] := a[15:0] AND b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 7 + i := j*32 + FOR h := 0 to 31 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - Copy 16-bit mask "a" to "k". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set). -k[15:0] := a[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR h := 0 to 31 + index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). -k[15:0] := NOT a[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR h := 0 to 31 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". -k[15:0] := a[15:0] OR b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*32 + FOR h := 0 to 31 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Unpack and interleave 8 bits from masks "a" and "b", and store the 16-bit result in "k". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set). -k[7:0] := b[7:0] -k[15:8] := a[7:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR h := 0 to 63 + index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). -k[15:0] := NOT (a[15:0] XOR b[15:0]) -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR h := 0 to 63 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Mask + AVX512F - Mask - - - - Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". -k[15:0] := a[15:0] XOR b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + i := j*64 + FOR h := 0 to 63 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + FOR h := 0 to 63 + index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + FOR h := 0 to 63 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Logical + + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". + +FOR j := 0 to 1 + i := j*64 + FOR h := 0 to 63 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Arithmetic - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := a[63:0] + b[63:0] -dst[127:64] := a[127:64] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := a[31:0] + b[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Logical + + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Logical + + + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512F - Miscellaneous - - - - - - Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and stores the low 64 bytes (16 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := temp[i+31:i] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Miscellaneous - - - - - Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst". - -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) -dst[511:0] := temp[511:0] -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512F - Miscellaneous - - - - - - - Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := temp[i+63:i] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Miscellaneous - - - - - - Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and stores the low 64 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := temp[i+63:i] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Logical + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Swizzle - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Logical + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Swizzle - - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Logical + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Swizzle - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - n := (j % 4)*64 - dst[i+63:i] := a[n+63:n] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Logical + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Swizzle - - - - - Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 + i := j*32 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+31:i] := a[31:0] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 + i := j*32 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+31:i] := a[31:0] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Set + + + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 4)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 3 i := j*32 - n := (j % 4)*32 IF k[j] - dst[i+31:i] := a[n+31:n] + dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Set + + + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 - n := (j % 4)*64 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 3 i := j*64 - n := (j % 4)*64 IF k[j] - dst[i+63:i] := a[n+63:n] + dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Set + + + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] @@ -74813,21 +69938,21 @@ FOR j := 0 to 7 dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] @@ -74835,3904 +69960,3945 @@ FOR j := 0 to 7 dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". + AVX512VL +
immintrin.h
+ Set +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 i := j*32 - dst[i+31:i] := a[31:0] + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Shift + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Swizzle - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Mask + AVX512F - Compare - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -k[MAX:1] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F - Compare - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -k[MAX:1] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F - Compare - - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F - Compare - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F - Compare - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -k[MAX:1] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F - Compare - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -k[MAX:1] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Mask - AVX512F - Compare - - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 + Shift + + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Mask + AVX512F - Compare - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Flag - AVX512F - Compare - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0 + Shift + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Flag + AVX512F - Compare - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0 - - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + Shift + + + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 64 -m := 0 -FOR j := 0 to 7 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 i := j*64 IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - Swizzle - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 64 -m := base_addr -FOR j := 0 to 7 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 i := j*64 IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". -size := 64 -m := 0 -FOR j := 0 to 7 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Store - Swizzle - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + Shift + + + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 32 -m := base_addr -FOR j := 0 to 15 - i := j*32 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 32 -m := 0 -FOR j := 0 to 15 - i := j*32 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 FI ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} FOR j := 0 to 7 i := j*32 - m := j*64 IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE - dst[m+63:m] := src[m+63:m] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} FOR j := 0 to 7 i := j*32 - m := j*64 IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE - dst[m+63:m] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + Shift + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - l := j*64 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - l := j*64 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Shift + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + Shift + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 i := j*32 - l := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 64*j +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". -FOR j := 0 to 7 +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 i := j*32 - l := j*64 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - l := j*64 +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - l := j*64 +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Shift + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 7 i := j*32 - l := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". -FOR j := 0 to 15 +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - m := j*16 +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - m := j*16 +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point + Shift + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Convert - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - m := j*16 +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - m := j*16 +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + Shift + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + Shift + + + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 32*j +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 32*j +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI ELSE - dst[i+15:i] := src[i+15:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 32*j +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI ELSE - dst[i+15:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 16*j - l := 32*j +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI ELSE - dst[i+15:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ELSE dst[i+31:i] := 0 - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_FP64_To_Int32(a[63:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_FP64_To_Int32(a[63:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[31:0] := Convert_FP64_To_Int32(a[63:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - [round_note] + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) - - -
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Convert_Int32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Shift + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + Shift + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 3 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [sae_note] + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [sae_note] - -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [sae_note] - -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - - -
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - - -
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - [round_note] + Shift + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". -dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + Shift + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) +FOR j := 0 to 1 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + Shift + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 64*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 64*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI ELSE dst[i+31:i] := 0 - FI + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 64*j +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI ENDFOR dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Convert - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 32*j - l := 64*j +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+63:i] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Shift + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 32*j - l := 64*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + Shift + + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". - [sae_note] + Shift + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - + AVX512VL
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] + Shift + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - - -
immintrin.h
-
- - Floating Point - Integer - AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". - [sae_note] + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 1 i := j*64 - l := j*32 - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - l := j*32 + i := j*32 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + dst[i+31:i] := SQRT(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - l := j*32 + i := j*32 IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + dst[i+31:i] := SQRT(a[i+31:i]) ELSE - dst[i+63:i] := 0 - FI + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F + AVX512VL
immintrin.h
-
- - Floating Point - Integer + Elementary Math Functions + + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst"." + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F + VAES
immintrin.h
-
- - Floating Point - Integer + Cryptography + + + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst"." + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + a[i+127:i] := MixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + VAES +
immintrin.h
+ Cryptography +
+ + + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F + VAES
immintrin.h
-
- - Floating Point - Integer + Cryptography + + + + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + a[i+127:i] := InvMixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Convert - - - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + VAES +
immintrin.h
+ Cryptography +
+ + + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 - i := 32*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 - i := 32*j + i := j*32 IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_note] -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[63:0] := a[63:0] + b[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the unsigned 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] -dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := src[63:0] +FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer + Arithmetic + + + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Convert - - - - Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := 0 +FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer + Arithmetic + + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Convert - - - - - Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[31:0] := a[31:0] + b[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - - Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := src[31:0] +FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := src[31:0] +FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - Integer + AVX512F - Convert - - - - Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := 0 +FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". FOR j := 0 to 7 @@ -78741,17 +73907,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", =and store the results in "dst". [round_note] @@ -78761,18 +73926,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -78785,19 +73949,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_note] @@ -78811,17 +73974,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -78834,18 +73996,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] @@ -78859,16 +74020,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". FOR j := 0 to 15 @@ -78877,17 +74037,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". [round_note] @@ -78897,18 +74056,17 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 @@ -78921,19 +74079,18 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_note] @@ -78947,17 +74104,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 @@ -78970,18 +74126,17 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] @@ -78995,17 +74150,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_note] @@ -79013,19 +74167,18 @@ dst[63:0] := a[63:0] / b[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - +
+ + + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_note] @@ -79037,18 +74190,17 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". IF k[0] @@ -79059,18 +74211,17 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_note] @@ -79082,17 +74233,16 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". IF k[0] @@ -79103,17 +74253,16 @@ FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] @@ -79121,19 +74270,18 @@ dst[31:0] := a[31:0] / b[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - +
+ + + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] @@ -79145,18 +74293,17 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". IF k[0] @@ -79167,18 +74314,17 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - +
+ + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] @@ -79190,17 +74336,16 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - +
+ + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". IF k[0] @@ -79211,5335 +74356,4307 @@ FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512F - Load - Swizzle - - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Load - Swizzle - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Swizzle - - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Load - Swizzle - - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "a" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + AVX512F - Swizzle - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + AVX512F - Load - Swizzle - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + AVX512F - Swizzle - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] -CASE imm8[1:0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -2: dst[127:0] := a[383:256] -3: dst[127:0] := a[511:384] -ESAC +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + AVX512F - Swizzle - - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + AVX512F - Swizzle - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR +dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + AVX512F - Swizzle - - - - Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -CASE imm8[0] OF -0: dst[255:0] := a[255:0] -1: dst[255:0] := a[511:256] -ESAC -dst[MAX:256] := 0 +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + AVX512F - Swizzle - - - - - - Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Swizzle - - - - - Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE - dst[i+63:i] := 0 + dst[i+63:i] := c[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + + AVX512F - Swizzle - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] -CASE imm8[1:0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -2: dst[127:0] := a[383:256] -3: dst[127:0] := a[511:384] -ESAC -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + + AVX512F - Swizzle - - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := tmp[i+31:i] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + + AVX512F - Swizzle - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := tmp[i+31:i] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE - dst[i+31:i] := 0 + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + + AVX512F - Swizzle - - - - Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[255:0] := a[255:0] -1: dst[255:0] := a[511:256] -ESAC -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - - Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + + AVX512F - Swizzle - - - - - Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI ELSE dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI ELSE - dst[i+63:i] := a[i+63:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + AVX512F - Miscellaneous - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI ELSE - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F - Miscellaneous - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + AVX512F - Miscellaneous - - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + AVX512F - Miscellaneous - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) -dst[127:32] := a[127:32] +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F - Miscellaneous - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "a" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_note] -dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". [round_note] IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := c[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := c[63:0] FI dst[127:64] := c[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_note] IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_note] IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". [round_note] IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := c[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := c[31:0] FI dst[127:32] := c[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + + + AVX512F +
immintrin.h
Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI + dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE - dst[i+63:i] := c[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] -FOR j := 0 to 7 - i := j*64 +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] * b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] * b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE - dst[i+63:i] := a[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - [round_note] +
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". -FOR j := 0 to 15 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI + dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE - dst[i+31:i] := c[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI + dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE - dst[i+31:i] := c[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE - dst[i+31:i] := a[i+31:i] - FI + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE - dst[i+31:i] := 0 - FI + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 - FI + FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 - FI + FI ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + + + + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_note] IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] + dst[63:0] := a[63:0] - b[63:0] ELSE - dst[63:0] := c[63:0] + dst[63:0] := src[63:0] FI -dst[127:64] := c[127:64] +dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] + + + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] + dst[63:0] := a[63:0] - b[63:0] ELSE - dst[63:0] := a[63:0] + dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + + + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_note] IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] + dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". +
+ + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] + dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_note] -dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] +dst[63:0] := a[63:0] - b[63:0] +dst[127:64] := a[127:64] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". +
+ + + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] + dst[31:0] := a[31:0] - b[31:0] ELSE - dst[31:0] := c[31:0] + dst[31:0] := src[31:0] FI -dst[127:32] := c[127:32] +dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". +
+ + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] + dst[31:0] := a[31:0] - b[31:0] ELSE - dst[31:0] := c[31:0] + dst[31:0] := src[31:0] FI -dst[127:32] := c[127:32] +dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] + dst[31:0] := a[31:0] - b[31:0] ELSE - dst[31:0] := a[31:0] + dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] + dst[31:0] := a[31:0] - b[31:0] ELSE - dst[31:0] := a[31:0] + dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI +dst[31:0] := a[31:0] - b[31:0] dst[127:32] := a[127:32] dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
+ + + + + Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + Store 16-bit mask from "a" into memory. + +MEM[mem_addr+15:mem_addr] := a[15:0] + + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - [round_note] +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +size := 64 +m := base_addr FOR j := 0 to 7 i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 7 - i := j*64 +size := 32 +m := base_addr +FOR j := 0 to 15 + i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] + MEM[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + Store 512-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 7 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + Store 512-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + AVX512F - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + MEM[mem_addr+63:mem_addr] := a[63:0] +FI - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + + Store the lower single-precision (32-bit) floating-point element from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + MEM[mem_addr+31:mem_addr] := a[31:0] +FI + + AVX512F - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 7 i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 15 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - [round_note] +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 15 i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +size := 32 +m := base_addr FOR j := 0 to 15 i := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] + MEM[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR -dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 15 - i := j*32 +size := 64 +m := base_addr +FOR j := 0 to 7 + i := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] + MEM[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR -dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + m := j*32 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR -dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 + m := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR -dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + AVX512F - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + m := j*64 IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI -ENDFOR -dst[MAX:512] := 0 +ENDFOR - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + AVX512F - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*64 + m := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI -ENDFOR -dst[MAX:512] := 0 +ENDFOR - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 + m := j*64 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI -ENDFOR -dst[MAX:512] := 0 +ENDFOR - - - + + AVX512F
immintrin.h
-
- - Floating Point + Store + + + + + + Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + AVX512F - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] +
immintrin.h
+ Store +
+ + + + + + + Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] +
immintrin.h
+ Store +
+ + + + Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Load + + + + + Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Load + + + + + Load 16-bit mask from memory into "k". -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +k[15:0] := MEM[mem_addr+15:mem_addr] - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Load + + + Swizzle + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := 0 FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - AVX512F Load - - - - + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 @@ -84550,19 +78667,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Load - - - - - - +
+ + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 @@ -84577,17 +78693,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Load - - - - +
+ + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 @@ -84598,19 +78713,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Load - - - - - - +
+ + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 @@ -84625,17 +78739,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Load - - - - +
+ + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 @@ -84646,19 +78759,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Load - - - - - - +
+ + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 @@ -84673,6183 +78785,6107 @@ FOR j := 0 to 7 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Load + + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Miscellaneous - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 7 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Load + + + + + Load 512-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + AVX512F - Miscellaneous - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 15 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - dst[63:0] := ConvertExpFP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - dst[63:0] := ConvertExpFP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - dst[31:0] := ConvertExpFP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - dst[31:0] := ConvertExpFP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 + Load + + + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 7 +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR + Load + + + + + Load 512-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) + Load + + + + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[63:0] := MEM[mem_addr+63:mem_addr] ELSE dst[63:0] := src[63:0] FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) + Load + + + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[63:0] := MEM[mem_addr+63:mem_addr] ELSE dst[63:0] := 0 FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) + Load + + + + + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[31:0] := MEM[mem_addr+31:mem_addr] ELSE dst[31:0] := src[31:0] FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) + Load + + + + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[31:0] := MEM[mem_addr+31:mem_addr] ELSE dst[31:0] := 0 FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[MAX:32] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + Load + + + + + Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -dst[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -2: dst[383:256] := b[127:0] -3: dst[511:384] := b[127:0] -ESAC +dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Copy "a" to "dst", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -dst[511:0] := a[511:0] -CASE (imm8[0]) OF -0: dst[255:0] := b[255:0] -1: dst[511:256] := b[255:0] -ESAC +dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -2: dst[383:256] := b[127:0] -3: dst[511:384] := b[127:0] -ESAC -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F + Load + + Swizzle - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC +m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F +
immintrin.h
+ Load +
+ Swizzle - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC +m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Copy "a" to "dst", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE (imm8[0]) OF -0: dst[255:0] := b[255:0] -1: dst[511:256] := b[255:0] -ESAC -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F + Load + + Swizzle - - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC +m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F +
immintrin.h
+ Load +
+ Swizzle - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC +m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*64 + m := j*32 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 - i := j*64 + i := j*32 + m := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note] +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 7 i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 + m := j*64 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Load +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := a[15:0] AND b[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := (NOT a[15:0]) AND b[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := NOT a[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 +k[15:0] := a[15:0] OR b[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note] +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 +k[15:0] := NOT (a[15:0] XOR b[15:0]) +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +k[15:0] := a[15:0] XOR b[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 16-bit mask "a" left by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] +k[MAX:0] := 0 +IF count[7:0] <= 15 + k[15:0] := a[15:0] << count[7:0] FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 16-bit mask "a" right by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 +k[MAX:0] := 0 +IF count[7:0] <= 15 + k[15:0] := a[15:0] >> count[7:0] FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 ELSE - dst[63:0] := 0 + dst := 0 +FI +IF tmp[15:0] == 0xFFFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note] - -dst[63:0] := MAX(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + Mask + + + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 ELSE - dst[31:0] := src[31:0] + dst := 0 FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0xFFFF + dst := 1 ELSE - dst[31:0] := src[31:0] + dst := 0 FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] +
immintrin.h
+ Mask +
+ + + + Convert 16-bit mask "a" into an integer value, and store the result in "dst". -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst := ZeroExtend32(a[15:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into an 16-bit mask, and store the result in "k". -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +k := ZeroExtend16(a[15:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". -dst[31:0] := MAX(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +k[15:0] := (NOT a[15:0]) AND b[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := a[15:0] AND b[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Mask +
+ + + + Copy 16-bit mask "a" to "k". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := a[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := NOT a[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := a[15:0] OR b[15:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Mask +
+ + + + + Unpack and interleave 8 bits from masks "a" and "b", and store the 16-bit result in "k". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 +k[7:0] := b[7:0] +k[15:8] := a[7:0] +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note] +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 +k[15:0] := NOT (a[15:0] XOR b[15:0]) +k[MAX:16] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +k[15:0] := a[15:0] XOR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0. + dst[15:0] := k1[15:0] | k2[15:0] +IF dst == 0 + SetZF() +FI - + + AVX512F
immintrin.h
-
- - Floating Point + Mask + + + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's. + dst[15:0] := k1[15:0] | k2[15:0] +IF PopCount(dst[15:0]) == 16 + SetCF() +FI + + AVX512F - Special Math Functions - - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Mask +
+ + + + Converts bit mask "k1" into an integer value, storing the results in "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +dst := ZeroExtend32(k1) - + + AVX512F
immintrin.h
-
- - Floating Point + Mask + + + + + Converts integer "mask" into bitmask, storing the result in "dst". + +dst := mask[15:0] + + AVX512F - Special Math Functions - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Mask +
+ + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and stores the low 64 bytes (16 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst". + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +dst[511:0] := temp[511:0] +dst[MAX:512] := 0 + + AVX512F - Special Math Functions - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+63:i] := temp[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note] + Miscellaneous + + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and stores the low 64 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" , and copy the upper element from "a" to the upper element of "dst". [sae_note] - -dst[63:0] := MIN(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] - -dst[31:0] := MIN(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Load - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - + Miscellaneous + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Move - - - - Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - - Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 + Miscellaneous + + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Move - - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] -tmp[319:256] := a[319:256] -tmp[383:320] := a[319:256] -tmp[447:384] := a[447:384] -tmp[511:448] := a[447:384] +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Move - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] -tmp[319:256] := a[319:256] -tmp[383:320] := a[319:256] -tmp[447:384] := a[447:384] -tmp[511:448] := a[447:384] +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Move - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". - -dst[63:0] := a[63:0] -dst[127:64] := a[63:0] -dst[191:128] := a[191:128] -dst[255:192] := a[191:128] -dst[319:256] := a[319:256] -dst[383:320] := a[319:256] -dst[447:384] := a[447:384] -dst[511:448] := a[447:384] -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Load - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 + Miscellaneous + + + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Move - - - - Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := 0 - FI + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Move - - - - Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI + Miscellaneous + + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - Load 512-bits of integer data from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Load - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - + Miscellaneous + + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - - - - Store 512-bits of integer data from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - -
immintrin.h
-
- - Integer - AVX512F - Load - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 + Miscellaneous + + + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Store - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Load - - - Load 512-bits of integer data from memory into "dst" using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 + Miscellaneous + + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - - - - Store 512-bits of integer data from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - -
immintrin.h
-
- - Floating Point - AVX512F - Store - - - - Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] + Miscellaneous + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - -
immintrin.h
-
- - Floating Point - AVX512F - Load - - - - - Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - + Miscellaneous + + + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} IF k[0] - dst[63:0] := MEM[mem_addr+63:mem_addr] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE - dst[63:0] := src[63:0] + dst[63:0] := a[63:0] FI -dst[MAX:64] := 0 +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Move - - - - - - Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} IF k[0] - dst[63:0] := b[63:0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE - dst[63:0] := src[63:0] + dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - - Store the lower double-precision (64-bit) floating-point element from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - MEM[mem_addr+63:mem_addr] := a[63:0] -FI - -
immintrin.h
-
- - Floating Point - AVX512F - Load - - - - Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - + Miscellaneous + + + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} IF k[0] - dst[63:0] := MEM[mem_addr+63:mem_addr] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI -dst[MAX:64] := 0 +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Move - - - - - Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} IF k[0] - dst[63:0] := b[63:0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] -tmp[287:256] := a[319:288] -tmp[319:288] := a[319:288] -tmp[351:320] := a[383:352] -tmp[383:352] := a[383:352] -tmp[415:384] := a[447:416] -tmp[447:416] := a[447:416] -tmp[479:448] := a[511:480] -tmp[511:480] := a[511:480] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] -tmp[287:256] := a[319:288] -tmp[319:288] := a[319:288] -tmp[351:320] := a[383:352] -tmp[383:352] := a[383:352] -tmp[415:384] := a[447:416] -tmp[447:416] := a[447:416] -tmp[479:448] := a[511:480] -tmp[511:480] := a[511:480] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[63:32] -dst[63:32] := a[63:32] -dst[95:64] := a[127:96] -dst[127:96] := a[127:96] -dst[159:128] := a[191:160] -dst[191:160] := a[191:160] -dst[223:192] := a[255:224] -dst[255:224] := a[255:224] -dst[287:256] := a[319:288] -dst[319:288] := a[319:288] -dst[351:320] := a[383:352] -dst[383:352] := a[383:352] -dst[415:384] := a[447:416] -dst[447:416] := a[447:416] -dst[479:448] := a[511:480] -dst[511:480] := a[511:480] -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] -tmp[287:256] := a[287:256] -tmp[319:288] := a[287:256] -tmp[351:320] := a[351:320] -tmp[383:352] := a[351:320] -tmp[415:384] := a[415:384] -tmp[447:416] := a[415:384] -tmp[479:448] := a[479:448] -tmp[511:480] := a[479:448] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512F - Move - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] -tmp[287:256] := a[287:256] -tmp[319:288] := a[287:256] -tmp[351:320] := a[351:320] -tmp[383:352] := a[351:320] -tmp[415:384] := a[415:384] -tmp[447:416] := a[415:384] -tmp[479:448] := a[479:448] -tmp[511:480] := a[479:448] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[31:0] -dst[63:32] := a[31:0] -dst[95:64] := a[95:64] -dst[127:96] := a[95:64] -dst[159:128] := a[159:128] -dst[191:160] := a[159:128] -dst[223:192] := a[223:192] -dst[255:224] := a[223:192] -dst[287:256] := a[287:256] -dst[319:288] := a[287:256] -dst[351:320] := a[351:320] -dst[383:352] := a[351:320] -dst[415:384] := a[415:384] -dst[447:416] := a[415:384] -dst[479:448] := a[479:448] -dst[511:480] := a[479:448] -dst[MAX:512] := 0 + Miscellaneous + + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Load - - - - - Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - dst[31:0] := MEM[mem_addr+31:mem_addr] -ELSE - dst[31:0] := src[31:0] -FI -dst[MAX:32] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - - - - Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := b[31:0] -ELSE - dst[31:0] := src[31:0] -FI + Miscellaneous + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Store - - - - - Store the lower single-precision (32-bit) floating-point element from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - MEM[mem_addr+31:mem_addr] := a[31:0] -FI - - -
immintrin.h
-
- - Floating Point + AVX512F - Load - - - - Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - dst[31:0] := MEM[mem_addr+31:mem_addr] -ELSE - dst[31:0] := 0 -FI -dst[MAX:32] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Move - - - - - Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - + Miscellaneous + + + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} IF k[0] - dst[31:0] := b[31:0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) ELSE - dst[31:0] := 0 + dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Load - - - Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Load - - - - - Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - -
immintrin.h
-
- - Floating Point - AVX512F - Load - - - - Load packed double-precision (64-bit) floating-point elements from memoy into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Store - - - - Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] + Miscellaneous + + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Load - - - Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Load - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 + Miscellaneous + + + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - -
immintrin.h
-
- - Floating Point - AVX512F - Load - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 + Miscellaneous + + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 + Miscellaneous + + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + dst[63:0] := ConvertExpFP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Arithmetic - - - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst[63:0] := ConvertExpFP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := a[63:0] * b[63:0] -dst[127:64] := a[127:64] +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + dst[31:0] := ConvertExpFP32(b[31:0]) +dst[127:32] := a[127:32] dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst[31:0] := ConvertExpFP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Arithmetic - - - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := a[31:0] * b[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Special Math Functions - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ABS(a[i+31:i]) -ENDFOR + Miscellaneous + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ABS(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Special Math Functions - - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 + Miscellaneous + + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Miscellaneous + + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Special Math Functions - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Miscellaneous + + + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Arithmetic - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Logical - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] - ELSE - dst[i+31:i] := 0 +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Logical - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - + RETURN tmp[63:0] +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Set - - - Broadcast 8-bit integer "a" to all elements of "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := a[7:0] +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[31:0] +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Set - - - - Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[31:0] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Set - - - Broadcast 32-bit integer "a" to all elements of "dst". +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[31:0] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] FI -ENDFOR -dst[MAX:512] := 0 + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] FI -ENDFOR -dst[MAX:512] := 0 + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Set - - - - Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] FI -ENDFOR -dst[MAX:512] := 0 + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Set - - - Broadcast 64-bit integer "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Set - - - Broadcast the low packed 16-bit integer from "a" to all all elements of "dst". + Miscellaneous + + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -dst[MAX:512] := 0 +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Miscellaneous + + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] FI -ENDFOR -k[MAX:16] := 0 + RETURN tmp[63:0] +} +dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + Miscellaneous + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + Miscellaneous + + + + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + Miscellaneous + + + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] FI -ENDFOR -k[MAX:8] := 0 + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] FI -ENDFOR -k[MAX:8] := 0 + RETURN tmp[31:0] +} +dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] FI -ENDFOR -k[MAX:8] := 0 + RETURN tmp[31:0] +} +dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + Miscellaneous + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + Miscellaneous + + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F - Compare - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 + Miscellaneous + + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -k[MAX:8] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[511:m] := src[511:m] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - Swizzle - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 + Miscellaneous + + + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[511:m] := src[511:m] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - Swizzle - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 64 -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size + Miscellaneous + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 15 i := j*32 - id := idx[i+3:i]*32 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 15 i := j*32 - id := idx[i+3:i]*32 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + Miscellaneous + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+31:i] := idx[i+31:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 15 i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := a[i+31:i] - FI + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+31:i] := 0 + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 - - - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} FOR j := 0 to 15 i := j*32 - off := idx[i+3:i]*32 - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ENDFOR dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set) - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + Miscellaneous + + + + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := a[i+63:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := 0 + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + Miscellaneous + + + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+31:i] := idx[i+31:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+31:i] := a[i+31:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+31:i] := 0 + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +dst[63:0] := SCALE(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + Miscellaneous + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := idx[i+63:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +dst[63:0] := SCALE(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := a[i+63:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := 0 + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + Miscellaneous + + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+ [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := src[i+63:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI -ENDFOR -dst[MAX:512] := 0 + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI ELSE - dst[i+63:i] := src[i+63:i] + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +dst[31:0] := SCALE(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[63:0] +} +dst[31:0] := SCALE(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := a[n+31:n] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := a[n+31:n] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". +
+ + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". -IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI -IF (imm8[4] == 0) dst[319:256] := a[319:256]; FI -IF (imm8[4] == 1) dst[319:256] := a[383:320]; FI -IF (imm8[5] == 0) dst[383:320] := a[319:256]; FI -IF (imm8[5] == 1) dst[383:320] := a[383:320]; FI -IF (imm8[6] == 0) dst[447:384] := a[447:384]; FI -IF (imm8[6] == 1) dst[447:384] := a[511:448]; FI -IF (imm8[7] == 0) dst[511:448] := a[447:384]; FI -IF (imm8[7] == 1) dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". +
+ + + + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF (b[1] == 0) dst[63:0] := a[63:0]; FI -IF (b[1] == 1) dst[63:0] := a[127:64]; FI -IF (b[65] == 0) dst[127:64] := a[63:0]; FI -IF (b[65] == 1) dst[127:64] := a[127:64]; FI -IF (b[129] == 0) dst[191:128] := a[191:128]; FI -IF (b[129] == 1) dst[191:128] := a[255:192]; FI -IF (b[193] == 0) dst[255:192] := a[191:128]; FI -IF (b[193] == 1) dst[255:192] := a[255:192]; FI -IF (b[257] == 0) dst[319:256] := a[319:256]; FI -IF (b[257] == 1) dst[319:256] := a[383:320]; FI -IF (b[321] == 0) dst[383:320] := a[319:256]; FI -IF (b[321] == 1) dst[383:320] := a[383:320]; FI -IF (b[385] == 0) dst[447:384] := a[447:384]; FI -IF (b[385] == 1) dst[447:384] := a[511:448]; FI -IF (b[449] == 0) dst[511:448] := a[447:384]; FI -IF (b[449] == 1) dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+63:i] := a[n+63:n] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Swizzle + + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
Swizzle - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) -tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) -tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) -tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) -tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) -tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) -tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) -tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) -tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) FOR j := 0 to 15 i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 + n := (j % 4)*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Swizzle + + + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) -tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) -tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) -tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) -tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) -tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) -tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) -tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) -tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+63:i] := a[n+63:n] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". +
+ + + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". +
+ + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], b[1:0]) -dst[63:32] := SELECT4(a[127:0], b[33:32]) -dst[95:64] := SELECT4(a[127:0], b[65:64]) -dst[127:96] := SELECT4(a[127:0], b[97:96]) -dst[159:128] := SELECT4(a[255:128], b[129:128]) -dst[191:160] := SELECT4(a[255:128], b[161:160]) -dst[223:192] := SELECT4(a[255:128], b[193:192]) -dst[255:224] := SELECT4(a[255:128], b[225:224]) -dst[287:256] := SELECT4(a[383:256], b[257:256]) -dst[319:288] := SELECT4(a[383:256], b[289:288]) -dst[351:320] := SELECT4(a[383:256], b[321:320]) -dst[383:352] := SELECT4(a[383:256], b[353:352]) -dst[415:384] := SELECT4(a[511:384], b[385:384]) -dst[447:416] := SELECT4(a[511:384], b[417:416]) -dst[479:448] := SELECT4(a[511:384], b[449:448]) -dst[511:480] := SELECT4(a[511:384], b[481:480]) +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - id := idx[i+2:i]*64 IF k[j] - dst[i+63:i] := a[id+63:id] + dst[i+63:i] := a[63:0] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := 0 - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - dst[i+63:i] := a[id+63:id] -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F Swizzle - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - id := idx[i+3:i]*32 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - id := idx[i+3:i]*32 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Swizzle - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - dst[i+31:i] := a[id+31:id] +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI ENDFOR +dst[511:m] := src[511:m] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F +
immintrin.h
Swizzle - - - - - - Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +size := 64 +m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR +dst[511:m] := 0 dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F +
immintrin.h
Swizzle - - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := src[i+63:i] + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[511:m] := src[511:m] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[511:m] := 0 dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F +
immintrin.h
Swizzle - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*64 - id := idx[i+2:i]*64 IF k[j] - dst[i+63:i] := a[id+63:id] + dst[i+63:i] := a[m+63:m] + m := m + 64 ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F Swizzle - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 i := j*64 - id := idx[i+2:i]*64 - dst[i+63:i] := a[id+63:id] + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F +
immintrin.h
Swizzle - - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). m := 0 FOR j := 0 to 15 @@ -90863,17807 +84899,16803 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load +
immintrin.h
Swizzle - - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + dst[i+31:i] := a[m+31:m] m := m + 32 ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + AVX512F +
immintrin.h
Swizzle - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 15 +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load +
immintrin.h
Swizzle - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 15 +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 + dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + AVX512F +
immintrin.h
Swizzle - - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 7 +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load +
immintrin.h
Swizzle - - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 7 +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+63:i] := tmp[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + AVX512F +
immintrin.h
Swizzle - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 7 - i := j*64 +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load +
immintrin.h
Swizzle - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 7 - i := j*64 +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Swizzle +
+ + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:512] := 0 +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 i := j*64 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Load - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Swizzle +
+ + + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - m := j*64 +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+63:i] := tmp[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". -FOR j := 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR +dst[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Load - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - m := j*64 +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + + Copy "a" to "dst", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + AVX512F - Special Math Functions - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + AVX512F - Special Math Functions - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + + Copy "a" to "dst", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + AVX512F - Special Math Functions - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+31:i] := a[31:0] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + dst[i+31:i] := a[31:0] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Special Math Functions - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + dst[i+63:i] := a[63:0] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +size := 32 +m := 0 FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[511:m] := src[511:m] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. -FOR j := 0 to 7 - i := j*64 +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] + dst[m+size-1:m] := a[i+31:i] + m := m + size FI ENDFOR +dst[511:m] := 0 dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +size := 64 +m := 0 FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 + dst[m+size-1:m] := a[i+63:i] + m := m + size FI ENDFOR +dst[511:m] := src[511:m] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Special Math Functions - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +size := 64 +m := 0 FOR j := 0 to 7 i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI ENDFOR +dst[511:m] := 0 dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 + id := idx[i+3:i]*32 IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) + dst[i+31:i] := a[id+31:id] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 + id := idx[i+3:i]*32 IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) + dst[i+31:i] := a[id+31:id] ELSE - dst[l+7:l] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+31:i]) + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j - l := 16*j + i := j*32 + off := idx[i+3:i]*32 IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+31:i] := idx[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j - l := 16*j + i := j*32 + off := idx[i+3:i]*32 IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j - l := 16*j + i := j*32 + off := idx[i+3:i]*32 IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) + dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] ELSE - dst[l+15:l] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set) FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 + off := idx[i+2:i]*64 IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+63:i] := idx[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 + off := idx[i+2:i]*64 IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 + off := idx[i+2:i]*64 IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) + dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] ELSE - dst[l+7:l] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[k+31:k] := Truncate32(a[i+63:i]) + i := j*64 + off := idx[i+2:i]*64 + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE - dst[l+31:l] := src[l+31:l] + dst[i+31:i] := idx[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 IF k[j] - MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) + dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] ELSE - dst[l+31:l] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 + off := idx[i+2:i]*64 IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+63:i] := idx[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 + off := idx[i+2:i]*64 IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE - dst[l+15:l] := 0 + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - l := 8*j +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) + dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) - FI +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + + AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - l := 8*j +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+7:l] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - l := 16*j +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - l := 16*j +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+15:l] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) - FI -ENDFOR +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) dst[511:448] := a[511:448]; FI +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +IF (b[129] == 0) dst[191:128] := a[191:128]; FI +IF (b[129] == 1) dst[191:128] := a[255:192]; FI +IF (b[193] == 0) dst[255:192] := a[191:128]; FI +IF (b[193] == 1) dst[255:192] := a[255:192]; FI +IF (b[257] == 0) dst[319:256] := a[319:256]; FI +IF (b[257] == 1) dst[319:256] := a[383:320]; FI +IF (b[321] == 0) dst[383:320] := a[319:256]; FI +IF (b[321] == 1) dst[383:320] := a[383:320]; FI +IF (b[385] == 0) dst[447:384] := a[447:384]; FI +IF (b[385] == 1) dst[447:384] := a[511:448]; FI +IF (b[449] == 0) dst[511:448] := a[447:384]; FI +IF (b[449] == 1) dst[511:448] := a[511:448]; FI +dst[MAX:512] := 0 + + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 8*j +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+7:l] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[k+31:k] := Saturate32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) +tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) +tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) +tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) +tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) +tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) +tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) +tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+31:l] := src[l+31:l] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 IF k[j] - MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) +tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) +tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) +tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) +tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) +tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) +tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) +tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+31:l] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[159:128] := SELECT4(a[255:128], b[129:128]) +dst[191:160] := SELECT4(a[255:128], b[161:160]) +dst[223:192] := SELECT4(a[255:128], b[193:192]) +dst[255:224] := SELECT4(a[255:128], b[225:224]) +dst[287:256] := SELECT4(a[383:256], b[257:256]) +dst[319:288] := SELECT4(a[383:256], b[289:288]) +dst[351:320] := SELECT4(a[383:256], b[321:320]) +dst[383:352] := SELECT4(a[383:256], b[353:352]) +dst[415:384] := SELECT4(a[511:384], b[385:384]) +dst[447:416] := SELECT4(a[511:384], b[417:416]) +dst[479:448] := SELECT4(a[511:384], b[449:448]) +dst[511:480] := SELECT4(a[511:384], b[481:480]) +dst[MAX:512] := 0 + + AVX512F - Convert - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 + id := idx[i+2:i]*64 IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Convert - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 + id := idx[i+2:i]*64 IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) + dst[i+63:i] := a[id+63:id] ELSE - dst[l+15:l] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +dst[MAX:512] := 0 + + AVX512F - Convert - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[i+31:i] := SignExtend32(a[k+7:k]) +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 + id := idx[i+3:i]*32 IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) + dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 + id := idx[i+3:i]*32 IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) + dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[i+63:i] := SignExtend64(a[k+7:k]) +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 + id := idx[i+2:i]*64 IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) + dst[i+63:i] := a[id+63:id] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := SignExtend64(a[k+31:k]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 - i := 64*j - l := 32*j + i := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - l := 32*j + i := j*64 + id := idx[i+2:i]*64 IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) + dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Swizzle + + + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +dst[MAX:512] := 0 + + AVX512F - Convert - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[i+31:i] := SignExtend32(a[k+15:k]) +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 15 i := j*32 - l := j*16 IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) + dst[i+31:i] := a[m+31:m] + m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 15 - i := 32*j - l := 16*j + i := j*32 IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) + dst[i+31:i] := a[m+31:m] + m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[i+63:i] := SignExtend64(a[k+15:k]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) + dst[i+63:i] := a[m+63:m] + m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +m := 0 FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) + dst[i+63:i] := a[m+63:m] + m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+7:l] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 - i := 32*j - l := 16*j + i := j*32 IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+7:l] := src[l+7:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+7:l] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[k+31:k] := SaturateU32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+31:l] := src[l+31:l] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[l+31:l] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+15:l] := src[l+15:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) - FI -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := 64*j - l := 16*j + i := j*64 IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE - dst[l+15:l] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[i+31:i] := ZeroExtend32(a[k+7:k]) -ENDFOR +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 - i := 32*j - l := 8*j + i := j*32 IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Zero extend packed unsigned 8-bit integers in the low 8 byte sof "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[i+63:i] := ZeroExtend64(a[k+7:k]) -ENDFOR +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 - i := 64*j - l := 8*j + i := j*64 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := ZeroExtend64(a[k+31:k]) -ENDFOR +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 64*j - l := 32*j +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[i+31:i] := ZeroExtend32(a[k+15:k]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 - i := 32*j - l := 16*j + i := j*32 IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[i+63:i] := ZeroExtend64(a[k+15:k]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Convert - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) -ENDFOR +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+31:i] * b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Logical - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Swizzle + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] } +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] } +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] } -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +dst[511:480] := SELECT4(b[511:384], imm8[7:6]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] } -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] } -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] } -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + Swizzle + + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] } +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + dst[i+31:i] := tmp_dst[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + Swizzle + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] } -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Miscellaneous - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 + Swizzle + + + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +k[MAX:1] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Shift - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 + Compare + + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +k[MAX:1] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Shift - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 + Compare + + + + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Store - - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR + Compare + + + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Store - - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR + Compare + + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +k[MAX:1] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Store - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F - Store - - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR + Compare + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +k[MAX:1] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 + Compare + + + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Compare + + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0 - + + AVX512F
immintrin.h
-
- - Integer + Compare + + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0 + + + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := 0 - FI + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Compare + + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Compare + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Compare + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 7 i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Compare + + + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := 0 - FI + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Compare +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 + m := j*64 IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 + m := j*64 IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[m+63:m] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] FOR j := 0 to 15 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := 32*j IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] FOR j := 0 to 7 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 - i := j*64 + i := j*32 + l := j*64 IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 + i := j*32 + l := j*64 IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE - dst[i+63:i] := 0 - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Shift - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 - FI + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Convert + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Arithmetic - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 - i := j*64 + i := j*32 + l := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE - dst[i+63:i] := 0 - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 + l := j*64 IF k[j] - FOR h := 0 to 31 - index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR j := 0 to 7 i := j*32 + l := j*64 IF k[j] - FOR h := 0 to 31 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] -FOR j := 0 to 15 - i := j*32 - FOR h := 0 to 31 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". FOR j := 0 to 7 - i := j*64 - IF k[j] - FOR h := 0 to 63 - index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 - i := j*64 + i := j*32 + l := j*64 IF k[j] - FOR h := 0 to 63 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - FOR h := 0 to 63 - index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] - dst[i+h] := imm8[index[2:0]] - ENDFOR + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Logical - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) ELSE - k[j] := 0 + dst[i+31:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Logical - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Logical - - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] FOR j := 0 to 15 i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Logical - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 15 i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Logical - - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE - k[j] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask + AVX512F - Logical - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 + m := j*16 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 + m := j*16 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Swizzle - - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Swizzle - - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 32*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 32*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Logical - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 64*j + l := 32*j IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Logical - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 32*j IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [sae_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := 16*j + l := 32*j IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE - dst[i+63:i] := 0 + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Elementary Math Functions - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + AVX512F - Elementary Math Functions - - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] -IF k[0] - dst[63:0] := (1.0 / b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] -IF k[0] - dst[63:0] := (1.0 / b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". -dst[63:0] := (1.0 / b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Elementary Math Functions - - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] IF k[0] - dst[31:0] := (1.0 / b[31:0]) + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] IF k[0] - dst[31:0] := (1.0 / b[31:0]) + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -dst[31:0] := (1.0 / b[31:0]) +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := 0 +FI dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + [round_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + [round_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Miscellaneous - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Miscellaneous - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Miscellaneous - - - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [sae_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [sae_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + [round_note] -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) + + AVX512F - Miscellaneous - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) + + AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := src[i+63:i] - FI + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := 0 - FI + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 32*j + l := 64*j IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[63:0] := (1.0 / SQRT(b[63:0])) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + Convert + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] -IF k[0] - dst[63:0] := (1.0 / SQRT(b[63:0])) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -dst[63:0] := (1.0 / SQRT(b[63:0])) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + Convert + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". -IF k[0] - dst[31:0] := (1.0 / SQRT(b[31:0])) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[31:0] := (1.0 / SQRT(b[31:0])) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + Convert + + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] -dst[31:0] := (1.0 / SQRT(b[31:0])) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} FOR j := 0 to 7 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + FOR j := 0 to 7 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 - i := j*64 + i := 32*j + l := 64*j IF k[j] - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SCALE(a[i+63:0], b[i+63:i]) +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + FOR j := 0 to 15 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI + dst[i+31:i] := 0 FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + FOR j := 0 to 15 - i := j*32 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j IF k[j] - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI + dst[i+31:i] := src[i+31:i] FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI + dst[i+31:i] := 0 FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:0], b[i+31:i]) ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[i+31:i]) ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI + dst[i+31:i] := 0 FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + AVX512F - Miscellaneous - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + AVX512F - Miscellaneous - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + AVX512F - Miscellaneous - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) + + AVX512F - Miscellaneous - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -dst[63:0] := SCALE(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) + + AVX512F - Miscellaneous - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -dst[63:0] := SCALE(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + AVX512F - Miscellaneous - - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + AVX512F - Miscellaneous - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + AVX512F - Miscellaneous - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Miscellaneous - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 + Convert + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - -
immintrin.h
-
- - Floating Point + AVX512F - Miscellaneous - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -dst[31:0] := SCALE(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) + + AVX512F - Miscellaneous - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -dst[31:0] := SCALE(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) + + AVX512F - Store - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 7 i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + l := j*32 + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - m := j*32 + l := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] -FOR j := 0 to 7 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Store - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 - i := j*32 + i := 32*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := 32*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Swizzle - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Swizzle - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Convert + + + + + + Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Swizzle - - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 - i := j*32 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[l+7:l] := Truncate8(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) FOR j := 0 to 15 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Swizzle - - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := 32*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[l+7:l] := Truncate8(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Integer + Convert + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Swizzle - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := 32*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[l+15:l] := Truncate16(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Swizzle - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + FI +ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] -tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] -tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] -tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := 32*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[l+15:l] := Truncate16(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+15:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Swizzle - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] -tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] -tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] -tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[l+7:l] := Truncate8(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". - -dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] -dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] -dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] -dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Swizzle - - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 64*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 64*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[l+7:l] := Truncate8(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -dst[351:320] := SELECT4(b[383:256], imm8[5:4]) -dst[383:352] := SELECT4(b[383:256], imm8[7:6]) -dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -dst[479:448] := SELECT4(b[511:384], imm8[5:4]) -dst[511:480] := SELECT4(b[511:384], imm8[7:6]) -dst[MAX:512] := 0 +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 32*j IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) + dst[l+31:l] := Truncate32(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 32*j IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 32*j IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) + dst[l+31:l] := Truncate32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note]. +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) + dst[l+15:l] := Truncate16(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+15:l] := src[l+15:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - [round_note]. +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) + dst[l+7:l] := Saturate8(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". FOR j := 0 to 15 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) + dst[l+7:l] := Saturate8(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - [round_note]. +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + Convert + + + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + FI +ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + Convert + + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := SQRT(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Convert + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Convert + + + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] + Convert + + + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -dst[31:0] := SQRT(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI +ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[l+7:l] := Saturate8(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+7:l] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:64] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Arithmetic - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 32*j IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] + dst[l+31:l] := Saturate32(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 64*j + l := 32*j IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 64*j + l := 32*j IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[l+31:l] := Saturate32(a[i+63:i]) ELSE - dst[i+31:i] := 0 - FI + dst[l+31:l] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + Convert + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Arithmetic - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + Convert + + + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -dst[63:0] := a[63:0] - b[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + FI +ENDFOR - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Arithmetic - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". -dst[31:0] := a[31:0] - b[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := SignExtend64(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 8*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := SignExtend64(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) +ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := 64*j + l := 32*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+63:i] := SignExtend64(a[l+31:l]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 + l := j*16 IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[i+31:i] := SignExtend32(a[l+15:l]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Swizzle - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := SignExtend64(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 - i := j*64 + i := 64*j + l := 16*j IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] + dst[i+63:i] := SignExtend64(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] + dst[l+7:l] := SaturateU8(a[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[l+7:l] := src[l+7:l] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 - i := j*32 + i := 32*j + l := 8*j IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) FI ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point + AVX512F - Swizzle - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Cast - - - Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point + AVX512F - Cast - - - Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Cast - - - Cast vector of type __m512d to type __m128d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Cast - - - Cast vector of type __m512 to type __m128. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI +ENDFOR + + AVX512F - Cast - - - Cast vector of type __m512d to type __m256d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Cast - - - Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Cast - - - Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Cast - - - Cast vector of type __m512 to type __m256. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Integer + Convert + + + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI +ENDFOR + + AVX512F - Cast - - - Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Integer + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + AVX512F - Cast - - - Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Integer + Convert + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Cast - - - Cast vector of type __m512i to type __m128i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Integer + Convert + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Cast - - - Cast vector of type __m512i to type __m256i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + AVX512F - Cast - - - Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512F - Cast - - - Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Integer + Convert + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Cast - - - Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Cast - - - Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point + Convert + + + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + AVX512F - Cast - - - Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Integer + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512F - Cast - - - Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
immintrin.h
-
- - Floating Point - AVX512F - Set - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + Convert + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[31:0] + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - Set packed 32-bit integers in "dst" with the repeated 4 element sequence. +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := a -dst[63:32] := b -dst[95:64] := c -dst[127:96] := d -dst[159:128] := a -dst[191:160] := b -dst[223:192] := c -dst[255:224] := d -dst[287:256] := a -dst[319:288] := b -dst[351:320] := c -dst[383:352] := d -dst[415:384] := a -dst[447:416] := b -dst[479:448] := c -dst[511:480] := d +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - Set packed 64-bit integers in "dst" with the repeated 4 element sequence. +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 8 byte sof "a" to packed 64-bit integers, and store the results in "dst". -dst[63:0] := a -dst[127:64] := b -dst[191:128] := c -dst[255:192] := d -dst[319:256] := a -dst[383:320] := b -dst[447:384] := c -dst[511:448] := d +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence. +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := a -dst[127:64] := b -dst[191:128] := c -dst[255:192] := d -dst[319:256] := a -dst[383:320] := b -dst[447:384] := c -dst[511:448] := d +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence. +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := a -dst[63:32] := b -dst[95:64] := c -dst[127:96] := d -dst[159:128] := a -dst[191:160] := b -dst[223:192] := c -dst[255:224] := d -dst[287:256] := a -dst[319:288] := b -dst[351:320] := c -dst[383:352] := d -dst[415:384] := a -dst[447:416] := b -dst[479:448] := c -dst[511:480] := d +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 -dst[71:64] := e8 -dst[79:72] := e9 -dst[87:80] := e10 -dst[95:88] := e11 -dst[103:96] := e12 -dst[111:104] := e13 -dst[119:112] := e14 -dst[127:120] := e15 -dst[135:128] := e16 -dst[143:136] := e17 -dst[151:144] := e18 -dst[159:152] := e19 -dst[167:160] := e20 -dst[175:168] := e21 -dst[183:176] := e22 -dst[191:184] := e23 -dst[199:192] := e24 -dst[207:200] := e25 -dst[215:208] := e26 -dst[223:216] := e27 -dst[231:224] := e28 -dst[239:232] := e29 -dst[247:240] := e30 -dst[255:248] := e31 -dst[263:256] := e32 -dst[271:264] := e33 -dst[279:272] := e34 -dst[287:280] := e35 -dst[295:288] := e36 -dst[303:296] := e37 -dst[311:304] := e38 -dst[319:312] := e39 -dst[327:320] := e40 -dst[335:328] := e41 -dst[343:336] := e42 -dst[351:344] := e43 -dst[359:352] := e44 -dst[367:360] := e45 -dst[375:368] := e46 -dst[383:376] := e47 -dst[391:384] := e48 -dst[399:392] := e49 -dst[407:400] := e50 -dst[415:408] := e51 -dst[423:416] := e52 -dst[431:424] := e53 -dst[439:432] := e54 -dst[447:440] := e55 -dst[455:448] := e56 -dst[463:456] := e57 -dst[471:464] := e58 -dst[479:472] := e59 -dst[487:480] := e60 -dst[495:488] := e61 -dst[503:496] := e62 -dst[511:504] := e63 +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values. +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 -dst[79:64] := e4 -dst[95:80] := e5 -dst[111:96] := e6 -dst[127:112] := e7 -dst[143:128] := e8 -dst[159:144] := e9 -dst[175:160] := e10 -dst[191:176] := e11 -dst[207:192] := e12 -dst[223:208] := e13 -dst[239:224] := e14 -dst[255:240] := e15 -dst[271:256] := e16 -dst[287:272] := e17 -dst[303:288] := e18 -dst[319:304] := e19 -dst[335:320] := e20 -dst[351:336] := e21 -dst[367:352] := e22 -dst[383:368] := e23 -dst[399:384] := e24 -dst[415:400] := e25 -dst[431:416] := e26 -dst[447:432] := e27 -dst[463:448] := e28 -dst[479:464] := e29 -dst[495:480] := e30 -dst[511:496] := e31 +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - - - - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values. +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[287:256] := e8 -dst[319:288] := e9 -dst[351:320] := e10 -dst[383:352] := e11 -dst[415:384] := e12 -dst[447:416] := e13 -dst[479:448] := e14 -dst[511:480] := e15 +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - - - - - Set packed 64-bit integers in "dst" with the supplied values. +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 -dst[319:256] := e4 -dst[383:320] := e5 -dst[447:384] := e6 -dst[511:448] := e7 +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - - - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 -dst[319:256] := e4 -dst[383:320] := e5 -dst[447:384] := e6 -dst[511:448] := e7 +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - - - - - - - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[287:256] := e8 -dst[319:288] := e9 -dst[351:320] := e10 -dst[383:352] := e11 -dst[415:384] := e12 -dst[447:416] := e13 -dst[479:448] := e14 -dst[511:480] := e15 +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - Set packed 32-bit integers in "dst" with the repeated 4 element sequence in reverse order. +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". -dst[31:0] := d -dst[63:32] := c -dst[95:64] := b -dst[127:96] := a -dst[159:128] := d -dst[191:160] := c -dst[223:192] := b -dst[255:224] := a -dst[287:256] := d -dst[319:288] := c -dst[351:320] := b -dst[383:352] := a -dst[415:384] := d -dst[447:416] := c -dst[479:448] := b -dst[511:480] := a +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - Set packed 64-bit integers in "dst" with the repeated 4 element sequence in reverse order. +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := d -dst[127:64] := c -dst[191:128] := b -dst[255:192] := a -dst[319:256] := d -dst[383:320] := c -dst[447:384] := b -dst[511:448] := a +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := d -dst[127:64] := c -dst[191:128] := b -dst[255:192] := a -dst[319:256] := d -dst[383:320] := c -dst[447:384] := b -dst[511:448] := a +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Convert + + + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". + +dst[31:0] := a[31:0] + + AVX512F - Set - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. +
immintrin.h
+ Convert +
+ + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". -dst[31:0] := d -dst[63:32] := c -dst[95:64] := b -dst[127:96] := a -dst[159:128] := d -dst[191:160] := c -dst[223:192] := b -dst[255:224] := a -dst[287:256] := d -dst[319:288] := c -dst[351:320] := b -dst[383:352] := a -dst[415:384] := d -dst[447:416] := c -dst[479:448] := b -dst[511:480] := a -dst[MAX:512] := 0 +dst[63:0] := a[63:0] + + AVX512F
immintrin.h
-
- - Integer + Convert + + + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + AVX512F - Set - - - - - - - - - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. +
immintrin.h
+ Convert +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] -dst[31:0] := e15 -dst[63:32] := e14 -dst[95:64] := e13 -dst[127:96] := e12 -dst[159:128] := e11 -dst[191:160] := e10 -dst[223:192] := e9 -dst[255:224] := e8 -dst[287:256] := e7 -dst[319:288] := e6 -dst[351:320] := e5 -dst[383:352] := e4 -dst[415:384] := e3 -dst[447:416] := e2 -dst[479:448] := e1 -dst[511:480] := e0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Set - - - - - - - - - - Set packed 64-bit integers in "dst" with the supplied values in reverse order. +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] -dst[63:0] := e7 -dst[127:64] := e6 -dst[191:128] := e5 -dst[255:192] := e4 -dst[319:256] := e3 -dst[383:320] := e2 -dst[447:384] := e1 -dst[511:448] := e0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - - - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] -dst[63:0] := e7 -dst[127:64] := e6 -dst[191:128] := e5 -dst[255:192] := e4 -dst[319:256] := e3 -dst[383:320] := e2 -dst[447:384] := e1 -dst[511:448] := e0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Set - - - - - - - - - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] -dst[31:0] := e15 -dst[63:32] := e14 -dst[95:64] := e13 -dst[127:96] := e12 -dst[159:128] := e11 -dst[191:160] := e10 -dst[223:192] := e9 -dst[255:224] := e8 -dst[287:256] := e7 -dst[319:288] := e6 -dst[351:320] := e5 -dst[383:352] := e4 -dst[415:384] := e3 -dst[447:416] := e2 -dst[479:448] := e1 -dst[511:480] := e0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - AVX512F - Set - - - Return vector of type __m512 with all elements set to zero. - -dst[MAX:0] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Set - - Return vector of type __m512i with all elements set to zero. - -dst[MAX:0] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Set - - Return vector of type __m512d with all elements set to zero. - -dst[MAX:0] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F - Set - - Return vector of type __m512 with all elements set to zero. - -dst[MAX:0] := 0 - - -
immintrin.h
-
- - Integer - AVX512F - Set - - Return vector of type __m512i with all elements set to zero. - -dst[MAX:0] := 0 - - -
immintrin.h
-
- - AVX512F - General Support - - - Return vector of type __m512 with undefined elements. -
immintrin.h
-
- - Integer + AVX512F - General Support - - Return vector of type __m512i with undefined elements. -
immintrin.h
-
- - Floating Point - AVX512F - General Support - - Return vector of type __m512d with undefined elements. -
immintrin.h
-
- - Floating Point - AVX512F - General Support - - Return vector of type __m512 with undefined elements.
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + Special Math Functions + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ACOS(a[i+63:i]) + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := ACOS(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ACOS(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ACOS(a[i+31:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ACOSH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := ACOSH(a[i+63:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ACOSH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ACOSH(a[i+31:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ASIN(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := ASIN(a[i+63:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ASIN(a[i+31:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := ASIN(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ASINH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Trigonometry - - - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ASINH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Trigonometry - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ASINH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := MAX(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Trigonometry - - - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ASINH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Trigonometry - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +dst[31:0] := MAX(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Trigonometry - - - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ATAN(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ATAN(a[i+63:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ATAN(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := ATAN(a[i+31:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ATANH(a[i+63:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := ATANH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the inverse hyperblic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ATANH(a[i+31:i]) + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ATANH(a[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := CubeRoot(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 + Special Math Functions + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := CubeRoot(a[i+63:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := CubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Special Math Functions + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := CubeRoot(a[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := CDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := CDFNormal(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + Special Math Functions + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 15 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := CDFNormal(a[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := CDFNormal(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 + Special Math Functions + + + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Probability/Statistics - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Probability/Statistics - - - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" , and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +dst[63:0] := MIN(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F +
immintrin.h
Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F +
immintrin.h
Special Math Functions - - - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := CEIL(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Special Math Functions + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +dst[31:0] := MIN(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F +
immintrin.h
Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) + dst[i+31:i] := ABS(a[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F +
immintrin.h
Special Math Functions - - - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := CEIL(a[i+31:i]) + dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := COS(a[i+63:i]) + dst[i+31:i] := ABS(a[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := COS(a[i+31:i]) +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ABS(a[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := COS(a[i+31:i]) + dst[i+63:i] := ABS(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := COSD(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 + Special Math Functions + + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := COSD(a[i+63:i]) + dst[i+63:i] := ABS(a[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := COSD(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Special Math Functions + + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := COSD(a[i+31:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - dst[i+63:i] := COSH(a[i+63:i]) + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := COSH(a[i+63:i]) + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := COSH(a[i+31:i]) +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := COSH(a[i+31:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ERF(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - - - Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 + Special Math Functions + + + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ERF(a[i+63:i]) + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 + Special Math Functions + + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ERF(a[i+31:i]) +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ERF(a[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+63:i] := 1.0 - ERF(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 + Special Math Functions + + + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+63:i] := 1.0 - ERF(a[i+31:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 + Special Math Functions + + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+63:i] := 1.0 / ERF(a[i+31:i]) +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+63:i] := 1.0 / ERF(a[i+31:i]) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 + Special Math Functions + + + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Probability/Statistics - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Probability/Statistics - - - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 + Special Math Functions + + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := POW(10.0, a[i+63:i]) + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POW(10.0, a[i+63:i]) + dst[i+63:i] := a[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +tmp[319:256] := a[319:256] +tmp[383:320] := a[319:256] +tmp[447:384] := a[447:384] +tmp[511:448] := a[447:384] +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) + dst[i+63:i] := tmp[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +tmp[319:256] := a[319:256] +tmp[383:320] := a[319:256] +tmp[447:384] := a[447:384] +tmp[511:448] := a[447:384] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := tmp[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) -ENDFOR +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] +dst[191:128] := a[191:128] +dst[255:192] := a[191:128] +dst[319:256] := a[319:256] +dst[383:320] := a[319:256] +dst[447:384] := a[447:384] +dst[511:448] := a[447:384] dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + dst[i+31:i] := a[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POW(e, a[i+63:i]) + dst[i+63:i] := a[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Move +
+ + + + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Move + + + + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + AVX512F - Elementary Math Functions - - - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +tmp[287:256] := a[319:288] +tmp[319:288] := a[319:288] +tmp[351:320] := a[383:352] +tmp[383:352] := a[383:352] +tmp[415:384] := a[447:416] +tmp[447:416] := a[447:416] +tmp[479:448] := a[511:480] +tmp[511:480] := a[511:480] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := POW(FP32(e), a[i+31:i]) + dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +tmp[287:256] := a[319:288] +tmp[319:288] := a[319:288] +tmp[351:320] := a[383:352] +tmp[383:352] := a[383:352] +tmp[415:384] := a[447:416] +tmp[447:416] := a[447:416] +tmp[479:448] := a[511:480] +tmp[511:480] := a[511:480] +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Move + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] +dst[159:128] := a[191:160] +dst[191:160] := a[191:160] +dst[223:192] := a[255:224] +dst[255:224] := a[255:224] +dst[287:256] := a[319:288] +dst[319:288] := a[319:288] +dst[351:320] := a[383:352] +dst[383:352] := a[383:352] +dst[415:384] := a[447:416] +dst[447:416] := a[447:416] +dst[479:448] := a[511:480] +dst[511:480] := a[511:480] +dst[MAX:512] := 0 + + AVX512F - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +tmp[287:256] := a[287:256] +tmp[319:288] := a[287:256] +tmp[351:320] := a[351:320] +tmp[383:352] := a[351:320] +tmp[415:384] := a[415:384] +tmp[447:416] := a[415:384] +tmp[479:448] := a[479:448] +tmp[511:480] := a[479:448] FOR j := 0 to 15 i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 -ENDFOR + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +tmp[287:256] := a[287:256] +tmp[319:288] := a[287:256] +tmp[351:320] := a[351:320] +tmp[383:352] := a[351:320] +tmp[415:384] := a[415:384] +tmp[447:416] := a[415:384] +tmp[479:448] := a[479:448] +tmp[511:480] := a[479:448] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 + dst[i+31:i] := tmp[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] +dst[159:128] := a[159:128] +dst[191:160] := a[159:128] +dst[223:192] := a[223:192] +dst[255:224] := a[223:192] +dst[287:256] := a[287:256] +dst[319:288] := a[287:256] +dst[351:320] := a[351:320] +dst[383:352] := a[351:320] +dst[415:384] := a[415:384] +dst[447:416] := a[415:384] +dst[479:448] := a[479:448] +dst[511:480] := a[479:448] dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Move +
+ + + + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FLOOR(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Move + + + + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + AVX512F - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". +
immintrin.h
+ Move +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := FLOOR(a[i+31:i]) + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) + dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j - IF b[i+31:i] == 0 - #DE +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - - Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := 32*j + i := j*32 IF k[j] - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + FOR h := 0 to 31 + index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 63 - i := 8*j - IF b[i+7:i] == 0 - #DE +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR h := 0 to 31 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+31:i] := 0 FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". -FOR j := 0 to 31 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +FOR j := 0 to 15 + i := j*32 + FOR h := 0 to 31 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := 64*j - IF b[i+63:i] == 0 - #DE + i := j*64 + IF k[j] + FOR h := 0 to 63 + index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := InvSQRT(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 + Logical + + + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := InvSQRT(a[i+63:i]) + FOR h := 0 to 63 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := InvSQRT(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := InvSQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 63 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 31 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 7 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + Logical + + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) + FOR h := 0 to 63 + index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] + dst[i+h] := imm8[index[2:0]] + ENDFOR ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE - dst[i+63:i] := src[i+63:i] + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +FOR j := 0 to 7 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE - dst[i+31:i] := src[i+31:i] + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LOG(1.0 + a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := LOG(1.0 + a[i+63:i]) + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE - dst[i+63:i] := src[i+63:i] + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(1.0 + a[i+31:i]) +FOR j := 0 to 7 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := LOG(1.0 + a[i+31:i]) + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + Broadcast 8-bit integer "a" to all elements of "dst". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LOG(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[7:0] ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Set + + + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := LOG(a[i+31:i]) + dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*64 + Set + + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+31:i] := a[31:0] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 + Set + + + + + Broadcast 32-bit integer "a" to all elements of "dst". + +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := NearbyInt(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 + Set + + + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := NearbyInt(a[i+63:i]) + dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := NearbyInt(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 + Set + + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := NearbyInt(a[i+31:i]) + dst[i+63:i] := a[63:0] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Set + + + + + Broadcast 64-bit integer "a" to all elements of "dst". FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Set + + + + + Broadcast the low packed 16-bit integer from "a" to all all elements of "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[15:0] ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Elementary Math Functions - - - Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Elementary Math Functions - - - - - Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Set + + + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point AVX512F - Elementary Math Functions - - - Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst". +
immintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) + dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point AVX512F - Elementary Math Functions - - - - - Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the repeated 4 element sequence. -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +dst[31:0] := a +dst[63:32] := b +dst[95:64] := c +dst[127:96] := d +dst[159:128] := a +dst[191:160] := b +dst[223:192] := c +dst[255:224] := d +dst[287:256] := a +dst[319:288] := b +dst[351:320] := c +dst[383:352] := d +dst[415:384] := a +dst[447:416] := b +dst[479:448] := c +dst[511:480] := d dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point AVX512F - Special Math Functions - - - Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RoundToNearestEven(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundToNearestEven(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR + Set + + + + + + + + Set packed 64-bit integers in "dst" with the repeated 4 element sequence. + +dst[63:0] := a +dst[127:64] := b +dst[191:128] := c +dst[255:192] := d +dst[319:256] := a +dst[383:320] := b +dst[447:384] := c +dst[511:448] := d dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point AVX512F - Special Math Functions - - - Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RoundToNearestEven(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundToNearestEven(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR + Set + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence. + +dst[63:0] := a +dst[127:64] := b +dst[191:128] := c +dst[255:192] := d +dst[319:256] := a +dst[383:320] := b +dst[447:384] := c +dst[511:448] := d dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point AVX512F - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". +
immintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i]) -ENDFOR +dst[31:0] := a +dst[63:32] := b +dst[95:64] := c +dst[127:96] := d +dst[159:128] := a +dst[191:160] := b +dst[223:192] := c +dst[255:224] := d +dst[287:256] := a +dst[319:288] := b +dst[351:320] := c +dst[383:352] := d +dst[415:384] := a +dst[447:416] := b +dst[479:448] := c +dst[511:480] := d dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point AVX512F - Special Math Functions - - - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ROUND(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 +dst[135:128] := e16 +dst[143:136] := e17 +dst[151:144] := e18 +dst[159:152] := e19 +dst[167:160] := e20 +dst[175:168] := e21 +dst[183:176] := e22 +dst[191:184] := e23 +dst[199:192] := e24 +dst[207:200] := e25 +dst[215:208] := e26 +dst[223:216] := e27 +dst[231:224] := e28 +dst[239:232] := e29 +dst[247:240] := e30 +dst[255:248] := e31 +dst[263:256] := e32 +dst[271:264] := e33 +dst[279:272] := e34 +dst[287:280] := e35 +dst[295:288] := e36 +dst[303:296] := e37 +dst[311:304] := e38 +dst[319:312] := e39 +dst[327:320] := e40 +dst[335:328] := e41 +dst[343:336] := e42 +dst[351:344] := e43 +dst[359:352] := e44 +dst[367:360] := e45 +dst[375:368] := e46 +dst[383:376] := e47 +dst[391:384] := e48 +dst[399:392] := e49 +dst[407:400] := e50 +dst[415:408] := e51 +dst[423:416] := e52 +dst[431:424] := e53 +dst[439:432] := e54 +dst[447:440] := e55 +dst[455:448] := e56 +dst[463:456] := e57 +dst[471:464] := e58 +dst[479:472] := e59 +dst[487:480] := e60 +dst[495:488] := e61 +dst[503:496] := e62 +dst[511:504] := e63 dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point AVX512F - Trigonometry - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) -ENDFOR +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 +dst[143:128] := e8 +dst[159:144] := e9 +dst[175:160] := e10 +dst[191:176] := e11 +dst[207:192] := e12 +dst[223:208] := e13 +dst[239:224] := e14 +dst[255:240] := e15 +dst[271:256] := e16 +dst[287:272] := e17 +dst[303:288] := e18 +dst[319:304] := e19 +dst[335:320] := e20 +dst[351:336] := e21 +dst[367:352] := e22 +dst[383:368] := e23 +dst[399:384] := e24 +dst[415:400] := e25 +dst[431:416] := e26 +dst[447:432] := e27 +dst[463:448] := e28 +dst[479:464] := e29 +dst[495:480] := e30 +dst[511:496] := e31 dst[MAX:512] := 0 + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + + + + + + + + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[287:256] := e8 +dst[319:288] := e9 +dst[351:320] := e10 +dst[383:352] := e11 +dst[415:384] := e12 +dst[447:416] := e13 +dst[479:448] := e14 +dst[511:480] := e15 +dst[MAX:512] := 0 + AVX512F - Trigonometry - - - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 64-bit integers in "dst" with the supplied values. -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SIN(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[319:256] := e4 +dst[383:320] := e5 +dst[447:384] := e6 +dst[511:448] := e7 dst[MAX:512] := 0 + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[319:256] := e4 +dst[383:320] := e5 +dst[447:384] := e6 +dst[511:448] := e7 +dst[MAX:512] := 0 + AVX512F - Trigonometry - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) -ENDFOR +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[287:256] := e8 +dst[319:288] := e9 +dst[351:320] := e10 +dst[383:352] := e11 +dst[415:384] := e12 +dst[447:416] := e13 +dst[479:448] := e14 +dst[511:480] := e15 dst[MAX:512] := 0 + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + + + + + Set packed 32-bit integers in "dst" with the repeated 4 element sequence in reverse order. + +dst[31:0] := d +dst[63:32] := c +dst[95:64] := b +dst[127:96] := a +dst[159:128] := d +dst[191:160] := c +dst[223:192] := b +dst[255:224] := a +dst[287:256] := d +dst[319:288] := c +dst[351:320] := b +dst[383:352] := a +dst[415:384] := d +dst[447:416] := c +dst[479:448] := b +dst[511:480] := a +dst[MAX:512] := 0 + AVX512F - Trigonometry - - - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the repeated 4 element sequence in reverse order. -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SIN(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +dst[63:0] := d +dst[127:64] := c +dst[191:128] := b +dst[255:192] := a +dst[319:256] := d +dst[383:320] := c +dst[447:384] := b +dst[511:448] := a dst[MAX:512] := 0 + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. + +dst[63:0] := d +dst[127:64] := c +dst[191:128] := b +dst[255:192] := a +dst[319:256] := d +dst[383:320] := c +dst[447:384] := b +dst[511:448] := a +dst[MAX:512] := 0 + AVX512F - Trigonometry - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SINH(a[i+63:i]) -ENDFOR +dst[31:0] := d +dst[63:32] := c +dst[95:64] := b +dst[127:96] := a +dst[159:128] := d +dst[191:160] := c +dst[223:192] := b +dst[255:224] := a +dst[287:256] := d +dst[319:288] := c +dst[351:320] := b +dst[383:352] := a +dst[415:384] := d +dst[447:416] := c +dst[479:448] := b +dst[511:480] := a dst[MAX:512] := 0 + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + + + + + + + + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e15 +dst[63:32] := e14 +dst[95:64] := e13 +dst[127:96] := e12 +dst[159:128] := e11 +dst[191:160] := e10 +dst[223:192] := e9 +dst[255:224] := e8 +dst[287:256] := e7 +dst[319:288] := e6 +dst[351:320] := e5 +dst[383:352] := e4 +dst[415:384] := e3 +dst[447:416] := e2 +dst[479:448] := e1 +dst[511:480] := e0 +dst[MAX:512] := 0 + AVX512F - Trigonometry - - - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SINH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR +dst[63:0] := e7 +dst[127:64] := e6 +dst[191:128] := e5 +dst[255:192] := e4 +dst[319:256] := e3 +dst[383:320] := e2 +dst[447:384] := e1 +dst[511:448] := e0 dst[MAX:512] := 0 + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e7 +dst[127:64] := e6 +dst[191:128] := e5 +dst[255:192] := e4 +dst[319:256] := e3 +dst[383:320] := e2 +dst[447:384] := e1 +dst[511:448] := e0 +dst[MAX:512] := 0 + AVX512F - Trigonometry - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SINH(a[i+31:i]) -ENDFOR +dst[31:0] := e15 +dst[63:32] := e14 +dst[95:64] := e13 +dst[127:96] := e12 +dst[159:128] := e11 +dst[191:160] := e10 +dst[223:192] := e9 +dst[255:224] := e8 +dst[287:256] := e7 +dst[319:288] := e6 +dst[351:320] := e5 +dst[383:352] := e4 +dst[415:384] := e3 +dst[447:416] := e2 +dst[479:448] := e1 +dst[511:480] := e0 dst[MAX:512] := 0 + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + + Return vector of type __m512 with all elements set to zero. + +dst[MAX:0] := 0 + + AVX512F - Trigonometry - - - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Set +
+ + + Return vector of type __m512i with all elements set to zero. -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SINH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +dst[MAX:0] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + Return vector of type __m512d with all elements set to zero. + +dst[MAX:0] := 0 + + AVX512F - Trigonometry - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SIND(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 +
immintrin.h
+ Set +
+ + + Return vector of type __m512 with all elements set to zero. + +dst[MAX:0] := 0 + + AVX512F
immintrin.h
-
- - Floating Point + Set + + + + Return vector of type __m512i with all elements set to zero. + +dst[MAX:0] := 0 + + AVX512F - Trigonometry - - - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 +
immintrin.h
+ Set +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := SIND(a[i+63:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SIND(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Shift + + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := SIND(a[i+31:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := TAN(a[i+63:i]) +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := TAN(a[i+63:i]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TAN(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Shift + + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := TAN(a[i+31:i]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 i := j*64 - dst[i+63:i] := TAND(a[i+63:i]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := TAND(a[i+63:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TAND(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Shift + + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := TAND(a[i+31:i]) + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := TANH(a[i+63:i]) +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := TANH(a[i+63:i]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TANH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Trigonometry - - - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Shift + + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := TANH(a[i+31:i]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". - FOR j := 0 to 7 +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 i := j*64 - dst[i+63:i] := TRUNCATE(a[i+63:i]) + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := TRUNCATE(a[i+63:i]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Special Math Functions - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TRUNCATE(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point - AVX512F - Special Math Functions - - - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Shift + + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := TRUNCATE(a[i+31:i]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} FOR j := 0 to 15 - i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := 32*j +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 IF k[j] - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 63 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + Shift + + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := 16*j - IF b[i+15:i] == 0 - #DE +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} FOR j := 0 to 7 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := 32*j + Shift + + + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 63 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer + AVX512F - Arithmetic - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 31 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
-
- - Integer - AVX512F - Arithmetic - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 7 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) + Shift + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0. - dst[15:0] := k1[15:0] | k2[15:0] -IF dst == 0 - SetZF() -FI - - -
immintrin.h
-
- - Mask - AVX512F - Mask - - - - Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's. - dst[15:0] := k1[15:0] | k2[15:0] -IF PopCount(dst[15:0]) == 16 - SetCF() -FI - - -
immintrin.h
-
- - AVX512F - Mask - - - Converts bit mask "k1" into an integer value, storing the results in "dst". - -dst := ZeroExtend32(k1) - - -
immintrin.h
-
- + AVX512F - Mask - - - Converts integer "mask" into bitmask, storing the result in "dst". - -dst := mask[15:0] - -
immintrin.h
-
- - Integer - AVX512F - Store - - - - Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst". + Shift + + + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} FOR j := 0 to 7 i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Integer + AVX512F - Store - - - - - - Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} FOR j := 0 to 7 i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0 -cos_res[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI ELSE - dst[i+63:i] := sin_src[i+63:i] - MEM[mem_addr+i+63:mem_addr+i] := cos_src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 -cos_res[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:512] := 0 -cos_res[MAX:512] := 0 -
immintrin.h
-
- - Floating Point + AVX512F - Trigonometry - - - - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI ELSE - dst[i+31:i] := sin_src[i+31:i] - MEM[mem_addr+i+31:mem_addr+i] := cos_src[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 -cos_res[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". - -dst[31:0] := a[31:0] - -
immintrin.h
-
- - Floating Point - AVX512F - Convert - - - Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". - -dst[63:0] := a[63:0] - - -
immintrin.h
-
- - Integer + AVX512F - Convert - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + Shift + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] + Shift + + + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Shift + + + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + Shift + + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + Shift + + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] + Shift + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] +FOR j := 0 to 7 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Shift + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] +FOR j := 0 to 7 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + Shift + + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Miscellaneous - - - - - Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst". + Shift + + + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) -dst[511:0] := temp[511:0] +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Miscellaneous - - - - - - - Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Shift + + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := temp[i+31:i] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Swizzle - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + Shift + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Swizzle - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := b[i+31:i] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 + Shift + + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 + Shift + + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". + Shift + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". + Shift + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 7 i := j*64 - k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". - FOR j := 0 to 7 + Shift + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 i := j*64 - k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". - FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 + Shift + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI ELSE - k[j] := 0 - FI + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC + Shift + + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) FI -ENDFOR -k[MAX:8] := 0 +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 7 + Shift + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 - ELSE - k[j] := 0 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 7 + Shift + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 i := j*64 - IF k1[j] - k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 - ELSE - k[j] := 0 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) FI ENDFOR -k[MAX:8] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC + Shift + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 i := j*32 - k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 + Shift + + + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". + Shift + + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + Shift + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 +FOR j := 0 to 7 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". + Shift + + + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". + Elementary Math Functions + + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". + Elementary Math Functions + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". + Elementary Math Functions + + + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 15 i := j*32 - k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". - FOR j := 0 to 15 + Elementary Math Functions + + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 i := j*32 - k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". - FOR j := 0 to 15 + Elementary Math Functions + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 i := j*32 - k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 + dst[i+31:i] := (1.0 / a[i+31:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 + Elementary Math Functions + + + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 +dst[63:0] := (1.0 / b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Floating Point - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + Elementary Math Functions + + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - [round_note] + Elementary Math Functions + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := (1.0 / b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ELSE - dst[i+63:i] := c[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] + Elementary Math Functions + + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] + Elementary Math Functions + + + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + Elementary Math Functions + + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - [round_note] + Elementary Math Functions + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := (1.0 / SQRT(b[63:0])) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] + Elementary Math Functions + + + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := (1.0 / SQRT(b[63:0])) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +dst[63:0] := (1.0 / SQRT(b[63:0])) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] + Elementary Math Functions + + + + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := (1.0 / SQRT(b[31:0])) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + Elementary Math Functions + + + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[31:0] := (1.0 / SQRT(b[31:0])) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - [round_note] + Elementary Math Functions + + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:512] := 0 +dst[31:0] := (1.0 / SQRT(b[31:0])) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := SQRT(a[i+63:i]) ELSE - dst[i+63:i] := c[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + Elementary Math Functions + + + + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := SQRT(a[i+63:i]) ELSE - dst[i+63:i] := c[i+63:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := SQRT(a[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + Elementary Math Functions + + + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note]. FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := SQRT(a[i+63:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + Elementary Math Functions + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - [round_note] + Elementary Math Functions + + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + [round_note]. -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := SQRT(a[i+31:i]) ELSE - dst[i+31:i] := c[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + Elementary Math Functions + + + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := SQRT(a[i+31:i]) ELSE - dst[i+31:i] := c[i+31:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := SQRT(a[i+31:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + Elementary Math Functions + + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := SQRT(a[i+31:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + Elementary Math Functions + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - [round_note] + Elementary Math Functions + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + [round_note]. -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + Elementary Math Functions + + + + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + Elementary Math Functions + + + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := SQRT(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := SQRT(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m128d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m128. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m256d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m256. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m128i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m256i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Return vector of type __m512 with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512i with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512d with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512 with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := src[i+63:i] FI -ENDFOR -dst[MAX:512] := 0 - - - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + + + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_note] -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + + + + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := src[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". [round_note] FOR j := 0 to 7 i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". [round_note] FOR j := 0 to 15 i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Load - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Load - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] -FOR j := 0 to 15 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 7 + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 7 + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 + Arithmetic + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 15 + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 15 + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:512] := 0 - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*64 + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 7 - i := j*64 + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) -ENDFOR -dst[MAX:512] := 0 - - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) -ENDFOR + Arithmetic + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Miscellaneous - - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 15 - i := j*32 + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := c[i+63:i] FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Load - - - Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Load - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Move - - - - - Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". -MEM[mem_addr+511:mem_addr] := a[511:0] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Load - - - Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] -dst[511:0] := MEM[mem_addr+511:mem_addr] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Load - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Move - - - - - Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] -MEM[mem_addr+511:mem_addr] := a[511:0] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Load - - - Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". -dst[511:0] := MEM[mem_addr+511:mem_addr] +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Load - - - Load 512-bits of integer data from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] -dst[511:0] := MEM[mem_addr+511:mem_addr] +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Load - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Move - - - - - Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] -MEM[mem_addr+511:mem_addr] := a[511:0] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - Store 512-bits of integer data from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". -MEM[mem_addr+511:mem_addr] := a[511:0] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Load - - - Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] -dst[511:0] := MEM[mem_addr+511:mem_addr] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Load - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Move - - - - - Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] -MEM[mem_addr+511:mem_addr] := a[511:0] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. FOR j := 0 to 7 @@ -108676,19 +101708,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_note] @@ -108702,16 +101733,15 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 @@ -108720,17 +101750,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". [round_note] @@ -108740,18 +101769,17 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. FOR j := 0 to 15 @@ -108764,19 +101792,18 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - - - + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_note] @@ -108790,16 +101817,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 15 @@ -108808,17 +101834,16 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Arithmetic - - - - + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". [round_note] @@ -108828,16 +101853,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Arithmetic - - - + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". FOR j := 0 to 15 @@ -108846,18 +101870,17 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Arithmetic - - - - - + + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 @@ -108870,1279 +101893,1274 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst". + Arithmetic + + + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[i+31:i] AND b[i+31:i] + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[511:0] := (a[511:0] AND b[511:0]) -dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst". + Arithmetic + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[511:0] := ((NOT a[511:0]) AND b[511:0]) -dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] - FI + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst". + Arithmetic + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". -dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst". - -dst[511:0] := (a[511:0] AND b[511:0]) -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Swizzle - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + Arithmetic + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Swizzle - - - - - Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + Arithmetic + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + [round_note] FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI + dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + Arithmetic + + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + Arithmetic + + + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] FOR j := 0 to 15 i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + Arithmetic + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 15 i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + Arithmetic + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + [round_note] FOR j := 0 to 15 i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + Arithmetic + + + + + + Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[31:0] := REDUCE_ADD(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[63:0] := REDUCE_ADD(tmp, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[63:0] := REDUCE_ADD(tmp, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". -FOR j := 0 to 15 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[31:0] := REDUCE_ADD(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". -FOR j := 0 to 15 +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 1 FI ENDFOR -k[MAX:16] := 0 +dst[31:0] := REDUCE_MUL(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1 FI ENDFOR -k[MAX:16] := 0 - - -
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 +dst[63:0] := REDUCE_MUL(tmp, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + Arithmetic + + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1.0 + FI ENDFOR -k[MAX:16] := 0 +dst[63:0] := REDUCE_MUL(tmp, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + Arithmetic + + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". -FOR j := 0 to 15 +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := FP32(1.0) + FI ENDFOR -k[MAX:16] := 0 +dst[31:0] := REDUCE_MUL(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + Arithmetic + + + + + Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a". -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_ADD(a, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + Arithmetic + + + + + Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a". -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_ADD(a, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + Arithmetic + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_ADD(a, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + Arithmetic + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_ADD(a, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + Arithmetic + + + + + Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] FI -ENDFOR -k[MAX:16] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MUL(a, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a". -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] FI -ENDFOR -k[MAX:16] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MUL(a, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] FI -ENDFOR -k[MAX:16] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MUL(a, 8) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] FI -ENDFOR -k[MAX:16] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MUL(a, 16) - + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst". FOR j := 0 to 15 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI + dst[i+31:i] := ABS(v2[i+31:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 + IF k[j] + dst[i+31:i] := ABS(v2[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Compare - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst". -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ABS(v2[i+63:i]) ENDFOR -k[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Swizzle - - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name. + Arithmetic + + + + + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := a[id+31:id] + dst[i+63:i] := ABS(v2[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Swizzle - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name. + Arithmetic + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst". -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +dst[511:0] := temp[511:0] dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Load - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + Miscellaneous + + + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) FOR j := 0 to 15 i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Load - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 + Miscellaneous + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 + Miscellaneous + + + + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 + Miscellaneous + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 + Miscellaneous + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 + Miscellaneous + + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 + Miscellaneous + + + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + Miscellaneous + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 + Miscellaneous + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 + Miscellaneous + + + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 + Miscellaneous + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 i := j*32 IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". - -FOR j := 0 to 15 + Miscellaneous + + + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 i := j*32 - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Miscellaneous + + + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + dst[i+63:i] := b[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + Swizzle + + + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[511:0] := (a[511:0] OR b[511:0]) -dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Swizzle + + + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + dst[i+31:i] := b[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the resut in "dst". + Swizzle + + + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + Swizzle + + + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name. FOR j := 0 to 15 i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + Swizzle + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name. FOR j := 0 to 15 i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Swizzle - - - - - + + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). DEFINE SELECT4(src, control) { @@ -110180,16 +103198,15 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Swizzle - - - + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". DEFINE SELECT4(src, control) { @@ -110219,1780 +103236,1865 @@ dst[479:448] := SELECT4(a[511:384], imm8[5:4]) dst[511:480] := SELECT4(a[511:384], imm8[7:6]) dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI + Swizzle + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI + Compare + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". -FOR j := 0 to 15 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI +FOR j := 0 to 7 + i := j*64 + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Shift - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + Compare + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE - dst[i+31:i] := 0 + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI + Compare + + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR -dst[MAX:512] := 0 +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Logical - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 - ELSE + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 + ELSE k[j] := 0 FI ENDFOR -k[MAX:16] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512F/KNCNI - Logical - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -k[MAX:16] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[511:0] := (a[511:0] XOR b[511:0]) -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] + IF k1[j] + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + IF k1[j] + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 15 i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI + Compare + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - [round_note] + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - [round_note] - -FOR j := 0 to 15 + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Cast - - - Cast vector of type __m512d to type __m512. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - Integer - AVX512F/KNCNI - Cast - - - Cast vector of type __m512d to type __m512i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Cast - - - Cast vector of type __m512 to type __m512d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - Integer - AVX512F/KNCNI - Cast - - - Cast vector of type __m512 to type __m512i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Cast - - - Cast vector of type __m512i to type __m512d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Cast - - - Cast vector of type __m512i to type __m512. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -dst[31:0] := 0 -FOR j := 0 to 15 + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". + FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := dst[31:0] + a[i+31:i] - FI + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -dst[63:0] := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := dst[63:0] + a[i+63:i] + Compare + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -dst[63:0] := 0.0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := dst[63:0] + a[i+63:i] + Compare + + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := 0.0 FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := dst[31:0] + a[i+31:i] + IF k1[j] + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 FI -ENDFOR +ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := 0xFFFFFFFF FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := dst[31:0] AND a[i+31:i] + IF k1[j] + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := 0xFFFFFFFFFFFFFFFF -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := dst[63:0] AND a[i+63:i] +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Int32(-0x80000000) FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := (dst[31:0] > a[i+31:i] ? dst[31:0] : a[i+31:i]) + IF k1[j] + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Int64(-0x8000000000000000) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := (dst[63:0] > a[i+63:i] ? dst[63:0] : a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := 0 FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := (dst[31:0] > a[i+31:i] ? dst[31:0] : a[i+31:i]) + IF k1[j] + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -dst[63:0] := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := (dst[63:0] > a[i+63:i] ? dst[63:0] : a[i+63:i]) + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -dst[63:0] := Cast_FP64(0xFFEFFFFFFFFFFFFF) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := (dst[63:0] > a[i+63:i] ? dst[63:0] : a[i+63:i]) + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -dst[31:0] := Cast_FP32(0xFF7FFFFF) + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := (dst[31:0] > a[i+31:i] ? dst[31:0] : a[i+31:i]) - FI + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". + Compare + + + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -dst[31:0] := Int32(0x7FFFFFFF) FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := (dst[31:0] < a[i+31:i] ? dst[31:0] : a[i+31:i]) - FI + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -dst[63:0] := Int64(0x7FFFFFFFFFFFFFFF) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := (dst[63:0] < a[i+63:i] ? dst[63:0] : a[i+63:i]) - FI +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -dst[31:0] := 0xFFFFFFFF FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := (dst[31:0] < a[i+31:i] ? dst[31:0] : a[i+31:i]) - FI + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -dst[63:0] := 0xFFFFFFFFFFFFFFFF -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := (dst[63:0] < a[i+63:i] ? dst[63:0] : a[i+63:i]) - FI +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". + Compare + + + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -dst[63:0] := Cast_FP64(0x7FEFFFFFFFFFFFFF) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := (dst[63:0] < a[i+63:i] ? dst[63:0] : a[i+63:i]) - FI +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". - -dst[31:0] := Cast_FP32(0x7F7FFFFF) + Compare + + + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := (dst[31:0] < a[i+31:i] ? dst[31:0] : a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + Compare + + + + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := 1 FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := dst[31:0] * a[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := 1 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := dst[63:0] * a[i+63:i] +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := 1.0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := dst[63:0] * a[i+63:i] +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := FP32(1.0) FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := dst[31:0] * a[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + Compare + + + + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := 0 FOR j := 0 to 15 i := j*32 - IF k[j] - dst[31:0] := dst[31:0] OR a[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". - -dst[63:0] := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[63:0] := dst[63:0] OR a[i+63:i] - FI + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a". + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". -dst[31:0] := 0 FOR j := 0 to 15 i := j*32 - dst[31:0] := dst[31:0] + a[i+31:i] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a". + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". -dst[63:0] := 0 -FOR j := 0 to 7 - i := j*64 - dst[63:0] := dst[63:0] + a[i+63:i] +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". -dst[63:0] := 0.0 -FOR j := 0 to 7 - i := j*64 - dst[63:0] := dst[63:0] + a[i+63:i] +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -dst[31:0] := 0.0 FOR j := 0 to 15 i := j*32 - dst[31:0] := dst[31:0] + a[i+31:i] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". -dst[31:0] := 0xFFFFFFFF FOR j := 0 to 15 i := j*32 - dst[31:0] := dst[31:0] AND a[i+31:i] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -dst[63:0] := 0xFFFFFFFFFFFFFFFF -FOR j := 0 to 7 - i := j*64 - dst[63:0] := dst[63:0] AND a[i+63:i] +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed signed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". - -dst[31:0] := Int32(-0x80000000) + Compare + + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 15 i := j*32 - dst[31:0] := (dst[31:0] > a[i+31:i] ? dst[31:0] : a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed signed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Int64(-0x8000000000000000) -FOR j := 0 to 7 - i := j*64 - dst[63:0] := (dst[63:0] > a[i+63:i] ? dst[63:0] : a[i+63:i]) -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". - -dst[31:0] := 0 FOR j := 0 to 15 i := j*32 - dst[31:0] := (dst[31:0] > a[i+31:i] ? dst[31:0] : a[i+31:i]) -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". - -dst[63:0] := 0 -FOR j := 0 to 7 - i := j*64 - dst[63:0] := (dst[63:0] > a[i+63:i] ? dst[63:0] : a[i+63:i]) -ENDFOR - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". - -dst[63:0] := Cast_FP64(0xFFEFFFFFFFFFFFFF) -FOR j := 0 to 7 - i := j*64 - dst[63:0] := (dst[63:0] > a[i+63:i] ? dst[63:0] : a[i+63:i]) + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Cast_FP32(0xFF7FFFFF) FOR j := 0 to 15 i := j*32 - dst[31:0] := (dst[31:0] > a[i+31:i] ? dst[31:0] : a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed signed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := Int32(0x7FFFFFFF) FOR j := 0 to 15 i := j*32 - dst[31:0] := (dst[31:0] < a[i+31:i] ? dst[31:0] : a[i+31:i]) -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed signed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". - -dst[63:0] := Int64(0x7FFFFFFFFFFFFFFF) -FOR j := 0 to 7 - i := j*64 - dst[63:0] := (dst[63:0] < a[i+63:i] ? dst[63:0] : a[i+63:i]) + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[31:0] := 0xFFFFFFFF FOR j := 0 to 15 i := j*32 - dst[31:0] := (dst[31:0] < a[i+31:i] ? dst[31:0] : a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := 0xFFFFFFFFFFFFFFFF -FOR j := 0 to 7 - i := j*64 - dst[63:0] := (dst[63:0] < a[i+63:i] ? dst[63:0] : a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := Cast_FP64(0x7FEFFFFFFFFFFFFF) -FOR j := 0 to 7 - i := j*64 - dst[63:0] := (dst[63:0] < a[i+63:i] ? dst[63:0] : a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Special Math Functions - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". + Compare + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -dst[31:0] := Cast_FP32(0x7F7FFFFF) FOR j := 0 to 15 i := j*32 - dst[31:0] := (dst[31:0] < a[i+31:i] ? dst[31:0] : a[i+31:i]) + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a". + Load + + + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -dst[31:0] := 1 FOR j := 0 to 15 i := j*32 - dst[31:0] := dst[31:0] * a[i+31:i] + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Arithmetic - - - Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a". + Load + + + + + Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -dst[63:0] := 1 -FOR j := 0 to 7 - i := j*64 - dst[63:0] := dst[63:0] * a[i+63:i] -ENDFOR +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + Load + + + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -dst[63:0] := 1.0 FOR j := 0 to 7 i := j*64 - dst[63:0] := dst[63:0] * a[i+63:i] -ENDFOR - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". - -dst[31:0] := FP32(1.0) -FOR j := 0 to 15 - i := j*32 - dst[31:0] := dst[31:0] * a[i+31:i] -ENDFOR - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". - -dst[31:0] := 0 -FOR j := 0 to 15 - i := j*32 - dst[31:0] := dst[31:0] OR a[i+31:i] + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + Load + + + + + Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -dst[63:0] := 0 -FOR j := 0 to 7 - i := j*64 - dst[63:0] := dst[63:0] OR a[i+63:i] -ENDFOR +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI - Logical - - - - - - Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := v2[i+31:i] & v3[i+31:i] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Convert - - - Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + Load + + + + + Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - n := j*64 - dst[n+63:n] := Convert_FP32_To_FP64(v2[i+31:i]) -ENDFOR +dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Convert - - - - - Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + Load 512-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 i := j*32 - l := j*64 IF k[j] - dst[l+63:l] := Convert_FP32_To_FP64(v2[i+31:i]) + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] ELSE - dst[l+63:l] := src[l+63:l] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512F/KNCNI - Convert - - - Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + Load + + + + + Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - l := j*64 - dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) -ENDFOR +dst[511:0] := MEM[mem_addr+511:mem_addr] dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512F/KNCNI - Convert - - - - - Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. FOR j := 0 to 7 - i := j*32 - n := j*64 + i := j*64 IF k[j] - dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE - dst[n+63:n] := src[n+63:n] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512F/KNCNI - Convert - - - Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + Load + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - n := j*64 - dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - AVX512F/KNCNI - Convert - - - - - Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - l := j*64 + m := j*32 IF k[j] - dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst[l+63:l] := src[l+63:l] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Load - - - - - - + + + + + + + + Up-converts 16 memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". AVX512 only supports _MM_UPCONV_EPI32_NONE. FOR j := 0 to 15 @@ -112010,21 +105112,20 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Load - - - - - - - - + + + + + + + + + + Up-converts 16 single-precision (32-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). AVX512 only supports _MM_UPCONV_EPI32_NONE. FOR j := 0 to 15 @@ -112046,19 +105147,18 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Load - - - - - - + + + + + + + + Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst". FOR j := 0 to 7 @@ -112072,21 +105172,20 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Load - - - - - - - - + + + + + + + + + + Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -112104,19 +105203,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Load - - - - - - + + + + + + + + Up-converts 16 memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst". AVX512 only supports _MM_UPCONV_PS_NONE. FOR j := 0 to 15 @@ -112135,21 +105233,20 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Load - - - - - - - - + + + + + + + + + + Up-converts 16 single-precision (32-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). AVX512 only supports _MM_UPCONV_PS_NONE. FOR j := 0 to 15 @@ -112172,19 +105269,18 @@ FOR j := 0 to 15 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Load - - - - - - + + + + + + + + Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst". FOR j := 0 to 7 @@ -112198,21 +105294,20 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Load - - - - - - - - + + + + + + + + + + Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -112231,234 +105326,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - - - Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". AVX512 only supports _MM_DOWNCONV_PS_NONE. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) - _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) - _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) - _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) - _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) - ESAC -ENDFOR - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - - - - Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" according to "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are written only when the corresponding mask bit is not set). AVX512 only supports _MM_DOWNCONV_PS_NONE. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - IF k[j] - CASE conv OF - _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) - _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) - _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) - _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) - _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) - ESAC - FI -ENDFOR - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - - - Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC -ENDFOR - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - - - - Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - IF k[j] - CASE conv OF - _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC - FI -ENDFOR - - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - - - - - Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC -ENDFOR - - -
immintrin.h
-
- - Integer - AVX512F/KNCNI - Store - - - - - - - - - Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - IF k[j] - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC - FI -ENDFOR - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Convert - - - Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. - -FOR j := 0 to 7 - i := j*64 - k := j*32 - dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Convert - - - - - Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Load - - - - + + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". FOR j := 0 to 7 @@ -112469,19 +105346,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Load - - - - - - + + + + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -112496,17 +105372,16 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Load - - - - + + + + + + Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" them in "dst". FOR j := 0 to 7 @@ -112517,19 +105392,18 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI Load - - - - - - + + + + + + + + Loads 8 double-precision (64-bit) floating-point elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 @@ -112544,18809 +105418,45658 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - Stores 8 packed double-precision (64-bit) floating-point elements in "a" and to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Store - - - - - - - Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. + Load + + + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst". + Move + + + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ABS(v2[i+31:i]) + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ABS(v2[i+31:i]) + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst". + Move + + + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - dst[i+63:i] := ABS(v2[i+63:i]) + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Arithmetic - - - - - Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := ABS(v2[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Elementary Math Functions - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + Store + + + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) -ENDFOR -dst[MAX:512] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512F/KNCNI - Elementary Math Functions - - - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Store + + + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) - ELSE - dst[i+31:i] := src[i+31:i] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Store + + - - - - - - - Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE. + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+15:i]) - ESAC -ENDFOR +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - Integer - AVX512F/KNCNI Store + + - - - - - - - - Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Elements are written using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, elements are left unchanged in memory). "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE. + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. FOR j := 0 to 15 i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - - IF k[j] - CASE conv OF - _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+15:i]) - ESAC - FI -ENDFOR - - -
immintrin.h
-
- - AVX512IFMA52 - Arithmetic - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := 0 + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] FI ENDFOR -dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - AVX512IFMA52 - Arithmetic - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + Store + + + + + + Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:512] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - AVX512IFMA52 - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Store + + + + + + Store 512-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - AVX512IFMA52 - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Store + + + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. FOR j := 0 to 7 i := j*64 IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := a[i+63:i] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - AVX512IFMA52 - AVX512VL - Arithmetic - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Store + + + + + + Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF - Load + Store + + - - - - - Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j:= 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF - Load + Store + + - - - - - - Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are only brought into cache when their corresponding mask bit is set). "scale" should be 1, 2, 4 or 8.. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j:= 0 to 7 - i := j*64 - m := j*64 +FOR j := 0 to 15 + i := j*32 + m := j*32 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF Store + + - - - - - Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. Elements are prefetched into cache level "hint", where "hint" is 0 or 1. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF Store + + - - - - - - Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 7 - i := j*64 - m := j*64 +FOR j := 0 to 15 + i := j*32 + m := j*32 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] FI ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF - Load + Store + + - - - - - Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + + + + + + + Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". AVX512 only supports _MM_DOWNCONV_PS_NONE. -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 m := j*32 addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + + CASE conv OF + _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) + _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) + _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) + _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) + _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) + ESAC ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF - Load + Store + + - - - - - - Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + + + + + + + + Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" according to "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are written only when the corresponding mask bit is not set). AVX512 only supports _MM_DOWNCONV_PS_NONE. -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + CASE conv OF + _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) + _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) + _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) + _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) + _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) + ESAC FI ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF Store + + - - - - - Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + + + + + + + Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". FOR j := 0 to 7 i := j*64 m := j*32 addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + + CASE conv OF + _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF Store + + - - - - - - Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + + + + + + + + Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. FOR j := 0 to 7 i := j*64 m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + CASE conv OF + _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC FI ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF - Load + Store + + - - - - - Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + + + + + + + Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF - Load + Store + + - - - - - - Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Prefetched elements are merged in cache using writemask "k" (elements are copied from memory when the corresponding mask bit is set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + + + + + + + + Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. FOR j := 0 to 7 i := j*64 - m := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC FI ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF Store + + - - - - - Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" and to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF Store + + - - - - - - Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. FOR j := 0 to 7 i := j*64 - m := j*64 + m := j*32 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+63:addr], hint) + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Load + Store + + - - - - - - Prefetch single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + + + + + + + Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE. FOR j := 0 to 15 i := j*32 m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) - FI + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+15:i]) + ESAC ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Load + Store + + - - - - - - Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. + + + + + + + + Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Elements are written using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, elements are left unchanged in memory). "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE. FOR j := 0 to 15 i := j*32 m := j*32 addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) + + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+15:i]) + ESAC + FI ENDFOR - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Load - - - - - - - - Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. + Store + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst". FOR j := 0 to 15 i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) - FI -ENDFOR - - - -
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Store - - - - - - - Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint", with a request for exclusive ownership. The "hint" parameter may be one of the following: _MM_HINT_T0 = 1 for prefetching to L1 cache, _MM_HINT_T1 = 2 for prefetching to L2 cache, _MM_HINT_T2 = 3 for prefetching to L2 cache non-temporal, _MM_HINT_NTA = 0 for prefetching to L1 cache non-temporal. The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent scatter intrinsic. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) + dst[i+31:i] := a[i+31:i] AND b[i+31:i] ENDFOR +dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Store - - - - - - - - Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. Only those elements whose corresponding mask bit in "k" is set are loaded into cache. + Logical + + + + + + Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". -cachev := 0 -FOR j := 0 to 15 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) - FI -ENDFOR +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Load - - - - - - Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + Logical + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst". FOR j := 0 to 15 i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ENDFOR +dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Store - - - - - - Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + Logical + + + + + + Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) -ENDFOR +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512PF/KNCNI - Store - - - - - - - Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. Only those elements whose corresponding mask bit in "k" is set are loaded into the desired cache. + Logical + + + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - m := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - Prefetch(MEM[addr+31:addr], hint) + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + Logical + + + + + + Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POPCNT(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 1 +FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) + dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + Logical + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POPCNT(a[i+63:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + Logical + + + + + + Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POPCNT(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 +dst[511:0] := (a[511:0] OR b[511:0]) +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the resut in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + Logical + + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := POPCNT(a[i+31:i]) + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR -dst[MAX:128] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - Bit Manipulation - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + Logical + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} FOR j := 0 to 15 i := j*32 - dst[i+31:i] := POPCNT(a[i+31:i]) + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +dst[511:0] := (a[511:0] XOR b[511:0]) dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - Bit Manipulation - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - Bit Manipulation - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + Logical + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} FOR j := 0 to 7 i := j*64 - dst[i+63:i] := POPCNT(a[i+63:i]) + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Logical + + + + + + Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_AND(src[32*len-1:0], len) } -FOR j := 0 to 7 - i := j*64 +tmp := a +FOR j := 0 to 16 + i := j*32 IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) + tmp[i+31:i] := a[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + tmp[i+31:i] := 0xFFFFFFFF FI ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_AND(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - AVX512VPOPCNTDQ - Bit Manipulation - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + + Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_AND(src[64*len-1:0], len) } -FOR j := 0 to 7 +tmp := a +FOR j := 0 to 8 i := j*64 IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) + tmp[i+63:i] := a[i+63:i] ELSE - dst[i+63:i] := 0 + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF FI ENDFOR -dst[MAX:512] := 0 +dst[63:0] := REDUCE_AND(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst". + Logical + + + + + + Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] ENDFOR + RETURN REDUCE_OR(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_OR(tmp, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Logical + + + + + + Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - addr := b + m * 32 - IF k[i] - dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) - FI +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] ENDFOR + RETURN REDUCE_OR(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[63:0] := REDUCE_OR(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - addr := b + m * 32 - IF k[i] - dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) - ELSE - dst.fp32[i] := 0 - FI +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] ENDFOR -ENDFOR -dst[MAX:512] := 0 + RETURN REDUCE_AND(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_AND(a, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst". + Logical + + + + + Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] ENDFOR -ENDFOR -dst[MAX:512] := 0 + RETURN REDUCE_AND(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_AND(a, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Logical + + + + + Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - addr := b + m * 32 - IF k[i] - dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) - FI +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] ENDFOR -ENDFOR -dst[MAX:512] := 0 + RETURN REDUCE_OR(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_OR(a, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Logical + + + + + Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - addr := b + m * 32 - IF k[i] - dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) - ELSE - dst.fp32[i] := 0 - FI +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] ENDFOR -ENDFOR -dst[MAX:512] := 0 + RETURN REDUCE_OR(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_OR(a, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate with the lower element in "a", and store the result in the lower element of "dst". + Logical + + + + + + + + Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[127:0] := src[127:0] -FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] & v3[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate with the lower element in "a", and store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set). - -dst[127:0] := src[127:0] -IF k[0] - FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) - ENDFOR -FI -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate with the lower element in "a", and store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set). - -dst[127:0] := src[127:0] -IF k[0] - FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) - ENDFOR -ELSE - dst.fp32[0] := 0 -FI -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst". + Logical + + + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[127:0] := src[127:0] -FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:128] := 0 - - -
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set). - -dst[127:0] := src[127:0] -IF k[0] - FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) - ENDFOR -FI -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512_4FMAPS - Arithmetic - - - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set). + Special Math Functions + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". -dst[127:0] := src[127:0] -IF k[0] - FOR m := 0 to 3 - addr := b + m * 32 - dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) - ENDFOR -ELSE - dst.fp32[0] := 0 -FI -dst[MAX:128] := 0 +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_4VNNIW - Arithmetic - - - - - - - - Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation, and store the results in "dst". + Special Math Functions + + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - lim_base := b + m*32 - t.dword := MEM[lim_base+31:lim_base] - p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) - p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) - dst.dword[i] := dst.dword[i] + p1.dword + p2.dword - ENDFOR +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_4VNNIW - Arithmetic - - - - - - - - - Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - IF k[i] - FOR m := 0 to 3 - lim_base := b + m*32 - t.dword := MEM[lim_base+31:lim_base] - p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) - p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) - dst.dword[i] := dst.dword[i] + p1.dword + p2.dword - ENDFOR - ELSE - dst.dword[i] := src.dword[i] - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_4VNNIW - Arithmetic - - - - - - - - - Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Special Math Functions + + + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[511:0] := src[511:0] -FOR i := 0 to 15 - IF k[i] - FOR m := 0 to 3 - lim_base := b + m*32 - t.dword := MEM[lim_base+31:lim_base] - p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) - p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) - dst.dword[i] := dst.dword[i] + p1.dword + p2.dword - ENDFOR +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst.dword[i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_4VNNIW - Arithmetic - - - - - - - - Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation and signed saturation, and store the results in "dst". + Special Math Functions + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - FOR m := 0 to 3 - lim_base := b + m*32 - t.dword := MEM[lim_base+31:lim_base] - p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) - p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) - dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword) - ENDFOR +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_4VNNIW - Arithmetic - - - - - - - - - Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).. + Special Math Functions + + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[511:0] := src[511:0] -FOR i := 0 to 15 - IF k[i] - FOR m := 0 to 3 - lim_base := b + m*32 - t.dword := MEM[lim_base+31:lim_base] - p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) - p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) - dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword) - ENDFOR +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE - dst.dword[i] := src.dword[i] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_4VNNIW - Arithmetic - - - - - - - - - Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).. + Special Math Functions + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". -dst[511:0] := src[511:0] -FOR i := 0 to 15 - IF k[i] - FOR m := 0 to 3 - lim_base := b + m*32 - t.dword := MEM[lim_base+31:lim_base] - p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) - p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) - dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword) - ENDFOR - ELSE - dst.dword[i] := 0 - FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + Special Math Functions + + + + + + Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 7 - IF j < 4 - t := b.fp32[j] +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] ELSE - t := a.fp32[j-4] + tmp[i+31:i] := Int32(-0x80000000) FI - dst.word[j] := Convert_FP32_To_BF16(t) ENDFOR -dst[MAX:128] := 0 +dst[31:0] := REDUCE_MAX(tmp, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 7 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 IF k[j] - IF j < 4 - t := b.fp32[j] - ELSE - t := a.fp32[j-4] - FI - dst.word[j] := Convert_FP32_To_BF16(t) + tmp[i+63:i] := a[i+63:i] ELSE - dst.word[j] := src.word[j] + tmp[i+63:i] := Int64(-0x8000000000000000) FI ENDFOR -dst[MAX:128] := 0 +dst[63:0] := REDUCE_MAX(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Special Math Functions + + + + + + Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 7 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 IF k[j] - IF j < 4 - t := b.fp32[j] - ELSE - t := a.fp32[j-4] - FI - dst.word[j] := Convert_FP32_To_BF16(t) + tmp[i+31:i] := a[i+31:i] ELSE - dst.word[j] := 0 + tmp[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[31:0] := REDUCE_MAX(tmp, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + Special Math Functions + + + + + + Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 15 - IF j < 8 - t := b.fp32[j] +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] ELSE - t := a.fp32[j-8] + tmp[i+63:i] := 0 FI - dst.word[j] := Convert_FP32_To_BF16(t) ENDFOR -dst[MAX:256] := 0 +dst[63:0] := REDUCE_MAX(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 15 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 IF k[j] - IF j < 8 - t := b.fp32[j] - ELSE - t := a.fp32[j-8] - FI - dst.word[j] := Convert_FP32_To_BF16(t) + tmp[i+63:i] := a[i+63:i] ELSE - dst.word[j] := src.word[j] + tmp[i+63:i] := Cast_FP64(0xFFEFFFFFFFFFFFFF) FI ENDFOR -dst[MAX:256] := 0 +dst[63:0] := REDUCE_MAX(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Special Math Functions + + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 15 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 IF k[j] - IF j < 8 - t := b.fp32[j] - ELSE - t := a.fp32[j-8] - FI - dst.word[j] := Convert_FP32_To_BF16(t) + tmp[i+31:i] := a[i+31:i] ELSE - dst.word[j] := 0 + tmp[i+31:i] := Cast_FP32(0xFF7FFFFF) FI ENDFOR -dst[MAX:256] := 0 +dst[31:0] := REDUCE_MAX(tmp, 16) - -
immintrin.h
-
- - Floating Point - AVX512_BF16 AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 31 - IF j < 16 - t := b.fp32[j] +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] ELSE - t := a.fp32[j-16] + tmp[i+31:i] := Int32(0x7FFFFFFF) FI - dst.word[j] := Convert_FP32_To_BF16(t) ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_MIN(tmp, 16) - -
immintrin.h
-
- - Floating Point - AVX512_BF16 AVX512F - Convert - - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 31 +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 IF k[j] - IF j < 16 - t := b.fp32[j] - ELSE - t := a.fp32[j-16] - FI - dst.word[j] := Convert_FP32_To_BF16(t) + tmp[i+63:i] := a[i+63:i] ELSE - dst.word[j] := src.word[j] + tmp[i+63:i] := Int64(0x7FFFFFFFFFFFFFFF) FI ENDFOR -dst[MAX:512] := 0 +dst[63:0] := REDUCE_MIN(tmp, 8) - -
immintrin.h
-
- - Floating Point - AVX512_BF16 AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 31 +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 IF k[j] - IF j < 16 - t := b.fp32[j] - ELSE - t := a.fp32[j-16] - FI - dst.word[j] := Convert_FP32_To_BF16(t) + tmp[i+31:i] := a[i+31:i] ELSE - dst.word[j] := 0 + tmp[i+31:i] := 0xFFFFFFFF FI ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_MIN(tmp, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + Special Math Functions + + + + + + Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 3 - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF + FI ENDFOR -dst[MAX:128] := 0 +dst[63:0] := REDUCE_MIN(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 + Special Math Functions + + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + tmp[i+63:i] := a[i+63:i] ELSE - dst.word[j] := src.word[j] + tmp[i+63:i] := Cast_FP64(0x7FEFFFFFFFFFFFFF) FI ENDFOR -dst[MAX:128] := 0 +dst[63:0] := REDUCE_MIN(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 + Special Math Functions + + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + tmp[i+31:i] := a[i+31:i] ELSE - dst.word[j] := 0 + tmp[i+31:i] := Cast_FP32(0x7F7FFFFF) FI ENDFOR -dst[MAX:128] := 0 +dst[31:0] := REDUCE_MIN(tmp, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + Special Math Functions + + + + + Reduce the packed signed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 7 - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) -ENDFOR -dst[MAX:128] := 0 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + Reduce the packed signed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 7 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := src.word[j] +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) FI -ENDFOR -dst[MAX:128] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Special Math Functions + + + + + Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 7 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := 0 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) FI -ENDFOR -dst[MAX:128] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) - -
immintrin.h
-
- - Floating Point - AVX512_BF16 AVX512F - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 15 - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) -ENDFOR -dst[MAX:256] := 0 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) - -
immintrin.h
-
- - Floating Point - AVX512_BF16 AVX512F - Convert - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 15 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := src.word[j] +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) FI -ENDFOR -dst[MAX:256] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) - -
immintrin.h
-
- - Floating Point - AVX512_BF16 AVX512F - Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 15 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := 0 +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) FI -ENDFOR -dst[MAX:256] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Arithmetic - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + Special Math Functions + + + + + Reduce the packed signed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) } -dst := src -FOR j := 0 to 3 - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := REDUCE_MIN(a, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Arithmetic - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Special Math Functions + + + + + Reduce the packed signed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 3 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := src.dword[j] +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) FI -ENDFOR -dst[MAX:128] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Arithmetic - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Special Math Functions + + + + + Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 3 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := 0 +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) FI -ENDFOR -dst[MAX:128] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Arithmetic - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + Special Math Functions + + + + + Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) } -dst := src -FOR j := 0 to 7 - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) -ENDFOR -dst[MAX:256] := 0 +dst[63:0] := REDUCE_MIN(a, 8) - + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Arithmetic - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y + Special Math Functions + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) } -dst := src -FOR j := 0 to 7 +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI ELSE - dst.dword[j] := src.dword[j] + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 - AVX512VL - Arithmetic - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 7 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 ELSE - dst.dword[j] := 0 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512_BF16 + AVX512F - Arithmetic - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src FOR j := 0 to 15 - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - AVX512_BF16 + Shift + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F - Arithmetic - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src FOR j := 0 to 15 + i := j*32 IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI ELSE - dst.dword[j] := src.dword[j] + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - AVX512_BF16 + AVX512F - Arithmetic - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src FOR j := 0 to 15 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) ELSE - dst.dword[j] := 0 + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512_BITALG - Bit Manipulation - - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR i := 0 to 7 //Qword - FOR j := 0 to 7 // Byte - IF k[i*8+j] - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) ELSE - dst[i*8+j] := 0 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) FI - ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512_BITALG - Bit Manipulation - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + Shift + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". -FOR i := 0 to 7 //Qword - FOR j := 0 to 7 // Byte - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ENDFOR +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI ENDFOR -dst[MAX:64] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512_BITALG - AVX512VL - Bit Manipulation - - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR i := 0 to 3 //Qword - FOR j := 0 to 7 // Byte - IF k[i*8+j] - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 ELSE - dst[i*8+j] := 0 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) FI - ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:32] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512_BITALG - AVX512VL - Bit Manipulation - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + Shift + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". -FOR i := 0 to 3 //Qword - FOR j := 0 to 7 // Byte - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ENDFOR +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI ENDFOR -dst[MAX:32] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512_BITALG - AVX512VL - Bit Manipulation - - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Shift + + + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR i := 0 to 1 //Qword - FOR j := 0 to 7 // Byte - IF k[i*8+j] - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) ELSE - dst[i*8+j] := 0 + dst[i+31:i] := 0 FI - ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR -dst[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - Mask - AVX512_BITALG - AVX512VL - Bit Manipulation - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + Shift + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -FOR i := 0 to 1 //Qword - FOR j := 0 to 7 // Byte - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ENDFOR +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ENDFOR -dst[MAX:16] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - Bit Manipulation - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + Shift + + + + + Cast vector of type __m512d to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := POPCNT(a[i+15:i]) +FOR j := 0 to 7 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_FP32_To_FP64(v2[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 7 + i := j*32 + l := j*64 IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) + dst[l+63:l] := Convert_FP32_To_FP64(v2[i+31:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[l+63:l] := src[l+63:l] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - Bit Manipulation - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 7 + i := j*32 + l := j*64 + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := j*64 IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) ELSE - dst[i+15:i] := 0 + dst[n+63:n] := src[n+63:n] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + Convert + + + + + Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := POPCNT(a[i+15:i]) +FOR j := 0 to 7 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + + Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 7 + i := j*32 + l := j*64 IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) ELSE - dst[i+15:i] := src[i+15:i] + dst[l+63:l] := src[l+63:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 7 + i := j*64 + k := j*32 + dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. + +FOR j := 0 to 7 + i := j*64 + l := j*32 IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) + dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) ELSE - dst[i+15:i] := 0 + dst[l+31:l] := src[l+31:l] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + Convert + + + + + + + + Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := POPCNT(a[i+15:i]) + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR -dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG - AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Store + + + + + + + + + Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} FOR j := 0 to 7 - i := j*16 + i := j*64 + m := j*32 IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 - + + AVX512F
immintrin.h
-
- - Integer - AVX512_BITALG + Store + + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE - dst[i+15:i] := 0 + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512IFMA52 + AVX512VL
immintrin.h
-
- - Integer - AVX512_BITALG - Bit Manipulation - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". + Arithmetic + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := POPCNT(a[i+7:i]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512IFMA52 + AVX512VL
immintrin.h
-
- - Integer - AVX512_BITALG - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512IFMA52 + AVX512VL
immintrin.h
-
- - Integer - AVX512_BITALG - Bit Manipulation - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ELSE - dst[i+7:i] := 0 + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_BITALG + + AVX512IFMA52 AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := POPCNT(a[i+7:i]) +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_BITALG + + AVX512IFMA52 AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_BITALG + + AVX512IFMA52 AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 3 + i := j*64 IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE - dst[i+7:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_BITALG + + AVX512IFMA52 AVX512VL - Bit Manipulation - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := POPCNT(a[i+7:i]) +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_BITALG + + AVX512IFMA52 AVX512VL - Bit Manipulation - - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE - dst[i+7:i] := src[i+7:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_BITALG + + AVX512IFMA52 AVX512VL - Bit Manipulation - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 1 + i := j*64 IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ELSE - dst[i+7:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - + + AVX512IFMA52 + AVX512VL
immintrin.h
-
- - AVX512_VBMI - Bit Manipulation - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + Arithmetic + + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". -FOR i := 0 to 7 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) ENDFOR dst[MAX:512] := 0 - + + AVX512IFMA52
immintrin.h
-
- - AVX512_VBMI - Bit Manipulation - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR i := 0 to 7 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] - FI - ENDFOR +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512IFMA52
immintrin.h
-
- - AVX512_VBMI - Bit Manipulation - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR i := 0 to 7 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := 0 - FI - ENDFOR +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR dst[MAX:512] := 0 - + + AVX512IFMA52
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Bit Manipulation - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + Arithmetic + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". -FOR i := 0 to 3 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512IFMA52
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Bit Manipulation - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR i := 0 to 3 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] - FI - ENDFOR +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := a[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512IFMA52
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Bit Manipulation - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR i := 0 to 3 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := 0 - FI - ENDFOR +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512IFMA52
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Bit Manipulation - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + Arithmetic + + + + + + + + + + Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR i := 0 to 1 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ENDFOR +FOR j:= 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) ENDFOR -dst[MAX:128] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Bit Manipulation - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + + + + + Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are only brought into cache when their corresponding mask bit is set). "scale" should be 1, 2, 4 or 8.. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR i := 0 to 1 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] - FI - ENDFOR +FOR j:= 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) + FI ENDFOR -dst[MAX:128] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Bit Manipulation - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Load + + + + + + + + Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR i := 0 to 1 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := 0 - FI - ENDFOR +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) ENDFOR -dst[MAX:128] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - Swizzle - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + Load + + + + + + + + + Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR j := 0 to 63 - i := j*8 - id := idx[i+5:i]*8 - dst[i+7:i] := a[id+7:id] +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) + FI ENDFOR -dst[MAX:512] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - Swizzle - - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + + + + Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR j := 0 to 63 - i := j*8 - id := idx[i+5:i]*8 - IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := src[i+7:i] - FI +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) ENDFOR -dst[MAX:512] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - Swizzle - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Load + + + + + + + + + Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Prefetched elements are merged in cache using writemask "k" (elements are copied from memory when the corresponding mask bit is set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR j := 0 to 63 - i := j*8 - id := idx[i+5:i]*8 +FOR j := 0 to 7 + i := j*64 + m := j*64 IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := 0 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) FI ENDFOR -dst[MAX:512] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + Load + + + + + + + + Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. Elements are prefetched into cache level "hint", where "hint" is 0 or 1. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 31 - i := j*8 - id := idx[i+4:i]*8 - dst[i+7:i] := a[id+7:id] +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) ENDFOR -dst[MAX:256] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Store + + + + + + + + + Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 31 - i := j*8 - id := idx[i+4:i]*8 +FOR j := 0 to 7 + i := j*64 + m := j*64 IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := src[i+7:i] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) FI ENDFOR -dst[MAX:256] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Store + + + + + + + + Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 31 - i := j*8 - id := idx[i+4:i]*8 +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) +ENDFOR + + + + AVX512PF +
immintrin.h
+ Store +
+ + + + + + + + Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) FI ENDFOR -dst[MAX:256] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". + Store + + + + + + + + Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*8 - id := idx[i+3:i]*8 - dst[i+7:i] := a[id+7:id] +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) ENDFOR -dst[MAX:128] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Store + + + + + + + + + Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -FOR j := 0 to 15 - i := j*8 - id := idx[i+3:i]*8 +FOR j := 0 to 7 + i := j*64 + m := j*64 IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := src[i+7:i] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+63:addr], hint) FI ENDFOR -dst[MAX:128] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Store + + + + + + + + + Prefetch single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. FOR j := 0 to 15 - i := j*8 - id := idx[i+3:i]*8 + i := j*32 + m := j*32 IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := 0 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) FI ENDFOR -dst[MAX:128] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - Swizzle - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + Load + + + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. +The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. -FOR j := 0 to 63 - i := j*8 - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) ENDFOR -dst[MAX:512] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Load + + + + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. +The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + i := j*32 + m := j*32 IF k[j] - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := a[i+7:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) FI ENDFOR -dst[MAX:512] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Load + + + + + + + + Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR j := 0 to 63 - i := j*8 - IF k[j] - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := idx[i+7:i] - FI +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) ENDFOR -dst[MAX:512] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Load + + + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint", with a request for exclusive ownership. The "hint" parameter may be one of the following: _MM_HINT_T0 = 1 for prefetching to L1 cache, _MM_HINT_T1 = 2 for prefetching to L2 cache, _MM_HINT_T2 = 3 for prefetching to L2 cache non-temporal, _MM_HINT_NTA = 0 for prefetching to L1 cache non-temporal. The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent scatter intrinsic. -FOR j := 0 to 63 - i := j*8 - IF k[j] - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := 0 - FI +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) ENDFOR -dst[MAX:512] := 0 - - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + Store + + + + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. +The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. Only those elements whose corresponding mask bit in "k" is set are loaded into cache. -FOR j := 0 to 31 - i := j*8 - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] +cachev := 0 +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) + FI ENDFOR -dst[MAX:256] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Store + + + + + + + + Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. -FOR j := 0 to 31 - i := j*8 - IF k[j] - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := a[i+7:i] - FI +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) ENDFOR -dst[MAX:256] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Store + + + + + + + + + Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. Only those elements whose corresponding mask bit in "k" is set are loaded into the desired cache. -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 + i := j*32 + m := j*32 IF k[j] - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := idx[i+7:i] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) FI ENDFOR -dst[MAX:256] := 0 - + + + AVX512PF
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Store + + + + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*8 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 IF k[j] - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + dst[i+63:i] := POPCNT(a[i+63:i]) ELSE - dst[i+7:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0 - - -
immintrin.h
-
- - AVX512_VBMI + + AVX512VPOPCNTDQ AVX512VL - Swizzle - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - AVX512_VBMI - AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 IF k[j] - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + dst[i+63:i] := POPCNT(a[i+63:i]) ELSE - dst[i+7:i] := a[i+7:i] + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512_VBMI + + AVX512VPOPCNTDQ AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". -FOR j := 0 to 15 - i := j*8 - IF k[j] - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := idx[i+7:i] - FI +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - AVX512_VBMI + + AVX512VPOPCNTDQ AVX512VL - Swizzle - - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*8 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 + i := j*64 IF k[j] - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + dst[i+63:i] := POPCNT(a[i+63:i]) ELSE - dst[i+7:i] := 0 + dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0 - - + + AVX512VPOPCNTDQ + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 i := j*64 IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + dst[i+63:i] := POPCNT(a[i+63:i]) ELSE - dst[i+63:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512VPOPCNTDQ + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Bit Manipulation + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". -FOR j := 0 to 7 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) - ELSE - dst[i+63:i] := a[i+63:i] - FI + dst[i+63:i] := POPCNT(a[i+63:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512VPOPCNTDQ + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + Bit Manipulation + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512VPOPCNTDQ AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + dst[i+31:i] := POPCNT(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512VPOPCNTDQ AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + dst[i+31:i] := POPCNT(a[i+31:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512VPOPCNTDQ AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512VPOPCNTDQ AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + dst[i+31:i] := POPCNT(a[i+31:i]) ELSE - dst[i+63:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512VPOPCNTDQ AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + dst[i+31:i] := POPCNT(a[i+31:i]) ELSE - dst[i+63:i] := a[i+63:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512VPOPCNTDQ AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512VPOPCNTDQ
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + dst[i+31:i] := POPCNT(a[i+31:i]) ELSE - dst[i+31:i] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512VPOPCNTDQ
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Bit Manipulation + + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + dst[i+31:i] := POPCNT(a[i+31:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512VPOPCNTDQ
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + Bit Manipulation + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512VPOPCNTDQ
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + dst[i+63:i] := POPCNT(a[i+63:i]) ELSE - dst[i+31:i] := 0 + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512VPOPCNTDQ
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Bit Manipulation + + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 7 - i := j*32 + i := j*64 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + dst[i+63:i] := POPCNT(a[i+63:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+63:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512VPOPCNTDQ
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + Bit Manipulation + + + + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst". -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) + ENDFOR ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := 0 - FI +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + addr := b + m * 32 + IF k[i] + dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) + FI + ENDFOR ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := a[i+31:i] - FI +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + addr := b + m * 32 + IF k[i] + dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) + ELSE + dst.fp32[i] := 0 + FI + ENDFOR ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + Arithmetic + + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) + ENDFOR ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := 0 - FI +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + addr := b + m * 32 + IF k[i] + dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) + FI + ENDFOR ENDFOR dst[MAX:512] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := a[i+15:i] - FI +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + addr := b + m * 32 + IF k[i] + dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr]) + ELSE + dst.fp32[i] := 0 + FI + ENDFOR ENDFOR dst[MAX:512] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + Arithmetic + + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate with the lower element in "a", and store the result in the lower element of "dst". -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +dst[127:0] := src[127:0] +FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate with the lower element in "a", and store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set). -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 +dst[127:0] := src[127:0] +IF k[0] + FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) + ENDFOR +FI +dst[MAX:128] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate with the lower element in "a", and store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set). -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 +dst[127:0] := src[127:0] +IF k[0] + FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) + ENDFOR +ELSE + dst.fp32[0] := 0 +FI +dst[MAX:128] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + Arithmetic + + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst". -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +dst[127:0] := src[127:0] +FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set). -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR +dst[127:0] := src[127:0] +IF k[0] + FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) + ENDFOR +FI dst[MAX:128] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set). -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR +dst[127:0] := src[127:0] +IF k[0] + FOR m := 0 to 3 + addr := b + m * 32 + dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr]) + ENDFOR +ELSE + dst.fp32[0] := 0 +FI dst[MAX:128] := 0 - + + AVX512_4FMAPS
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + Arithmetic + + + + + + + + + + + + Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation, and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + lim_base := b + m*32 + t.dword := MEM[lim_base+31:lim_base] + p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) + p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) + dst.dword[i] := dst.dword[i] + p1.dword + p2.dword + ENDFOR ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_4VNNIW
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +dst[511:0] := src[511:0] +FOR i := 0 to 15 + IF k[i] + FOR m := 0 to 3 + lim_base := b + m*32 + t.dword := MEM[lim_base+31:lim_base] + p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) + p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) + dst.dword[i] := dst.dword[i] + p1.dword + p2.dword + ENDFOR ELSE - dst[i+63:i] := 0 + dst.dword[i] := src.dword[i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512_4VNNIW
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +dst[511:0] := src[511:0] +FOR i := 0 to 15 + IF k[i] + FOR m := 0 to 3 + lim_base := b + m*32 + t.dword := MEM[lim_base+31:lim_base] + p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) + p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) + dst.dword[i] := dst.dword[i] + p1.dword + p2.dword + ENDFOR ELSE - dst[i+63:i] := src[i+63:i] + dst.dword[i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512_4VNNIW
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + Arithmetic + + + + + + + + + + Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation and signed saturation, and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +dst[511:0] := src[511:0] +FOR i := 0 to 15 + FOR m := 0 to 3 + lim_base := b + m*32 + t.dword := MEM[lim_base+31:lim_base] + p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) + p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) + dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword) + ENDFOR ENDFOR dst[MAX:512] := 0 - + + AVX512_4VNNIW
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).. -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +dst[511:0] := src[511:0] +FOR i := 0 to 15 + IF k[i] + FOR m := 0 to 3 + lim_base := b + m*32 + t.dword := MEM[lim_base+31:lim_base] + p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) + p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) + dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword) + ENDFOR ELSE - dst[i+63:i] := 0 + dst.dword[i] := src.dword[i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512_4VNNIW
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + + + + Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).. -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +dst[511:0] := src[511:0] +FOR i := 0 to 15 + IF k[i] + FOR m := 0 to 3 + lim_base := b + m*32 + t.dword := MEM[lim_base+31:lim_base] + p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0])) + p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1])) + dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword) + ENDFOR ELSE - dst[i+63:i] := src[i+63:i] + dst.dword[i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512_4VNNIW
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + Arithmetic + + + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +FOR j := 0 to 31 + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 31 IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) ELSE - dst[i+63:i] := 0 + dst.word[j] := src.word[j] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 to 31 IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) ELSE - dst[i+63:i] := src[i+63:i] + dst.word[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +FOR j := 0 to 15 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*32 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ELSE - dst[i+31:i] := 0 + dst.word[j] := src.word[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*32 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ELSE - dst[i+31:i] := src[i+31:i] + dst.word[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + Convert + + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] -ENDFOR +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +ENDFOR dst[MAX:512] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ELSE - dst[i+31:i] := 0 + dst.dword[j] := src.dword[j] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ELSE - dst[i+31:i] := src[i+31:i] + dst.dword[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512_BF16 + AVX512F
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + Arithmetic + + + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] -ENDFOR -dst[MAX:256] := 0 - - -
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + IF j < 4 + t := b.fp32[j] ELSE - dst[i+31:i] := 0 + t := a.fp32[j-4] FI + dst.word[j] := Convert_FP32_To_BF16(t) ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) ELSE - dst[i+31:i] := src[i+31:i] + dst.word[j] := src.word[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 7 IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) ELSE - dst[i+15:i] := 0 + dst.word[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_BF16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +FOR j := 0 to 15 + IF j < 8 + t := b.fp32[j] ELSE - dst[i+15:i] := src[i+15:i] + t := a.fp32[j-8] FI + dst.word[j] := Convert_FP32_To_BF16(t) ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] -ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) ELSE - dst[i+15:i] := 0 + dst.word[j] := src.word[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) ELSE - dst[i+15:i] := src[i+15:i] + dst.word[j] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +FOR j := 0 to 3 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 3 IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ELSE - dst[i+15:i] := 0 + dst.word[j] := src.word[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*16 +FOR j := 0 to 3 IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ELSE - dst[i+15:i] := src[i+15:i] + dst.word[j] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ENDFOR dst[MAX:128] := 0 - + + AVX512_BF16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ELSE - dst[i+63:i] := 0 + dst.word[j] := src.word[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_BF16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*64 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) ELSE - dst[i+63:i] := a[i+63:i] + dst.word[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_BF16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + Convert + + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 3 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src FOR j := 0 to 3 - i := j*64 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ELSE - dst[i+63:i] := 0 + dst.dword[j] := src.dword[j] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src FOR j := 0 to 3 - i := j*64 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ELSE - dst[i+63:i] := a[i+63:i] + dst.dword[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ELSE - dst[i+63:i] := 0 + dst.dword[j] := src.dword[j] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) ELSE - dst[i+63:i] := a[i+63:i] + dst.dword[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BF16 AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] +FOR i := 0 to 3 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - + + AVX512_BITALG + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Bit Manipulation + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := 0 - FI +FOR i := 0 to 3 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR ENDFOR -dst[MAX:512] := 0 +dst[MAX:32] := 0 - + + AVX512_BITALG + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := a[i+31:i] - FI +FOR i := 0 to 1 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR ENDFOR -dst[MAX:512] := 0 +dst[MAX:16] := 0 - + + AVX512_BITALG + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + Bit Manipulation + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". -FOR j := 0 to 15 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] +FOR i := 0 to 1 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR ENDFOR -dst[MAX:512] := 0 +dst[MAX:16] := 0 - + + AVX512_BITALG + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 + Bit Manipulation + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] + dst[i+15:i] := POPCNT(a[i+15:i]) ELSE - dst[i+31:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] + dst[i+15:i] := POPCNT(a[i+15:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 7 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] + dst[i+15:i] := POPCNT(a[i+15:i]) ELSE - dst[i+31:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] + dst[i+15:i] := POPCNT(a[i+15:i]) ELSE - dst[i+31:i] := a[i+31:i] + dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_BITALG + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := POPCNT(a[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512_BITALG + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Bit Manipulation + + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 31 - i := j*16 + i := j*8 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := POPCNT(a[i+7:i]) ELSE - dst[i+15:i] := a[i+15:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512_BITALG + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + Bit Manipulation + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 15 - i := j*16 + i := j*8 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := POPCNT(a[i+7:i]) ELSE - dst[i+15:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} FOR j := 0 to 15 - i := j*16 + i := j*8 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] + dst[i+7:i] := POPCNT(a[i+7:i]) ELSE - dst[i+15:i] := a[i+15:i] + dst[i+7:i] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_BITALG AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". +
immintrin.h
+ Bit Manipulation +
+ + + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 +FOR i := 0 to 7 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:64] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + +FOR i := 0 to 7 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR +ENDFOR +dst[MAX:64] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := POPCNT(a[i+15:i]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + AVX512_BITALG
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := POPCNT(a[i+15:i]) ELSE - dst[i+15:i] := 0 + dst[i+15:i] := src[i+15:i] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_BITALG
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Bit Manipulation + + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 i := j*16 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := POPCNT(a[i+15:i]) ELSE - dst[i+15:i] := a[i+15:i] + dst[i+15:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_BITALG
immintrin.h
-
- - Integer - AVX512_VBMI2 - AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + Bit Manipulation + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:512] := 0 - + + AVX512_BITALG
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Bit Manipulation + + + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] + dst[i+7:i] := POPCNT(a[i+7:i]) ELSE - dst[i+63:i] := 0 + dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512_BITALG
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Bit Manipulation + + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] + dst[i+7:i] := POPCNT(a[i+7:i]) ELSE - dst[i+63:i] := src[i+63:i] + dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0 - + + AVX512_BITALG
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst"). + Bit Manipulation + + + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] +FOR j := 0 TO 7 + dst.fp16[j] := a.fp16[j] + b.fp16[j] ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 TO 7 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] + dst.fp16[j] := a.fp16[j] + b.fp16[j] ELSE - dst[i+63:i] := 0 + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 +FOR j := 0 TO 7 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] + dst.fp16[j] := a.fp16[j] + b.fp16[j] ELSE - dst[i+63:i] := src[i+63:i] + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst"). +
immintrin.h
+ Arithmetic +
+ + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] +FOR j := 0 TO 15 + dst.fp16[j] := a.fp16[j] + b.fp16[j] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 TO 15 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] + dst.fp16[j] := a.fp16[j] + b.fp16[j] ELSE - dst[i+63:i] := 0 + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 +FOR j := 0 TO 15 IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] + dst.fp16[j] := a.fp16[j] + b.fp16[j] ELSE - dst[i+63:i] := src[i+63:i] + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst"). +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] +FOR j := 0 to 7 + dst.fp16[j] := a.fp16[j] / b.fp16[j] ENDFOR dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := a.fp16[j] / b.fp16[j] ELSE - dst[i+31:i] := 0 + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := a.fp16[j] / b.fp16[j] ELSE - dst[i+31:i] := src[i+31:i] + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + Arithmetic + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". FOR j := 0 to 15 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := a.fp16[j] / b.fp16[j] ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 15 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := a.fp16[j] / b.fp16[j] ELSE - dst[i+31:i] := 0 + dst.fp16[j] := src.fp16[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 +FOR j := 0 to 15 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := a.fp16[j] / b.fp16[j] ELSE - dst[i+31:i] := src[i+31:i] + dst.fp16[j] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". FOR j := 0 to 7 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+31:i] := 0 + dst.fp16[j] := a.fp16[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 +FOR j := 0 to 7 IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+31:i] := src[i+31:i] + dst.fp16[j] := c.fp16[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". - -FOR j := 0 to 3 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] -ENDFOR -dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 7 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Shift - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst"). + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := c.fp16[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 - i := j*16 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst"). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] +FOR j := 0 to 7 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*16 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := a.fp16[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*16 IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := c.fp16[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Shift - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst"). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI ENDFOR dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Load - Swizzle - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". -m := 0 -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := a.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Load - Swizzle - - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := c.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". -m := 0 -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := a.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 7 IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 + Arithmetic + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 15 - i := j*16 IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := a.fp16[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 15 - i := j*16 IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := c.fp16[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 15 - i := j*16 IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". -m := 0 FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 - ELSE - dst[i+15:i] := 0 - FI + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 7 - i := j*16 IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := a.fp16[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 7 - i := j*16 IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := 0 + dst.fp16[j] := c.fp16[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 7 - i := j*16 IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+15:i] := src[i+15:i] + dst.fp16[j] := 0 FI ENDFOR dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Load - Swizzle - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". -m := 0 -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+7:i] := 0 + dst.fp16[j] := a.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Load - Swizzle - - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 15 IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+7:i] := src[i+7:i] + dst.fp16[j] := c.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+7:i] := 0 + dst.fp16[j] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 +FOR j := 0 to 7 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+7:i] := src[i+7:i] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI ELSE - dst[i+7:i] := 0 + dst.fp16[j] := a.fp16[j] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Load - Swizzle - - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 15 - i := j*8 +FOR j := 0 to 7 IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI ELSE - dst[i+7:i] := src[i+7:i] + dst.fp16[j] := c.fp16[j] FI ENDFOR dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 63 - i := j*8 +FOR j := 0 to 7 IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI ELSE - dst[i+7:i] := 0 + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Arithmetic + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 +FOR j := 0 to 15 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] ELSE - dst[i+7:i] := src[i+7:i] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI ELSE - dst[i+7:i] := 0 + dst.fp16[j] := a.fp16[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -m := 0 -FOR j := 0 to 31 - i := j*8 +FOR j := 0 to 15 IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI ELSE - dst[i+7:i] := src[i+7:i] + dst.fp16[j] := c.fp16[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -m := 0 FOR j := 0 to 15 - i := j*8 IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI ELSE - dst[i+7:i] := 0 + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 +FOR j := 0 to 7 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] ELSE - dst[i+7:i] := src[i+7:i] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] FI ENDFOR dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Store - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -size := 16 -m := base_addr -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 7 IF k[j] - MEM[m+size-1:m] := a[i+15:i] - m := m + size + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Store - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -size := 16 -m := base_addr -FOR j := 0 to 15 - i := j*16 +FOR j := 0 to 7 IF k[j] - MEM[m+size-1:m] := a[i+15:i] - m := m + size + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] FI ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Store - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 16 -m := base_addr FOR j := 0 to 7 - i := j*16 IF k[j] - MEM[m+size-1:m] := a[i+15:i] - m := m + size + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 FI ENDFOR +dst[MAX:128] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + Arithmetic + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". -size := 16 -m := 0 -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size +FOR j := 0 to 15 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] FI ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -size := 16 -m := 0 -FOR j := 0 to 31 - i := j*16 +FOR j := 0 to 15 IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] FI ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -size := 16 -m := 0 FOR j := 0 to 15 - i := j*16 IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] FI ENDFOR -dst[255:m] := 0 dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 16 -m := 0 FOR j := 0 to 15 - i := j*16 IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 FI ENDFOR -dst[255:m] := src[255:m] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + + + AVX512_FP16 AVX512VL - Swizzle - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". -size := 16 -m := 0 -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size - FI +FOR j := 0 TO 7 + dst.fp16[j] := a.fp16[j] - b.fp16[j] ENDFOR -dst[127:m] := 0 dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 16 -m := 0 -FOR j := 0 to 7 - i := j*16 +FOR j := 0 TO 7 IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[127:m] := src[127:m] dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Store - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + Arithmetic + + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 8 -m := base_addr -FOR j := 0 to 63 - i := j*8 +FOR j := 0 TO 7 IF k[j] - MEM[m+size-1:m] := a[i+7:i] - m := m + size + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 FI ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 + Arithmetic + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL - Store - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 8 -m := base_addr -FOR j := 0 to 31 - i := j*8 +FOR j := 0 TO 15 IF k[j] - MEM[m+size-1:m] := a[i+7:i] - m := m + size + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] FI ENDFOR +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Store - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 8 -m := base_addr -FOR j := 0 to 15 - i := j*8 +FOR j := 0 TO 15 IF k[j] - MEM[m+size-1:m] := a[i+7:i] - m := m + size + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 FI ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + Arithmetic + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". -size := 8 -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI +FOR i := 0 TO 7 + dst.fp16[i] := a.fp16[i] * b.fp16[i] ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + Arithmetic + + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 8 -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size +FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] FI ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Swizzle - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 8 -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size +FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 FI ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VBMI2 + Arithmetic + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 15 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -size := 8 -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size +FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] FI ENDFOR -dst[255:m] := src[255:m] dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Swizzle - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -size := 8 -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size +FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 FI ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VBMI2 + + AVX512_FP16 AVX512VL - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -size := 8 -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) ENDFOR -dst[127:m] := src[127:m] dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) ELSE - dst.dword[j] := 0 + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 15 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) - ELSE - dst.dword[j] := 0 +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) ELSE - dst.dword[j] := 0 + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) ELSE - dst.dword[j] := 0 + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 15 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := 0 + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI + Arithmetic + + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := 0 + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI + Arithmetic + + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ENDFOR dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := 0 + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 15 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI + Arithmetic + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := 0 + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] ELSE - dst.dword[j] := src.dword[j] + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI ENDFOR dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI + Arithmetic + + + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". -FOR j := 0 to 3 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := 0 - FI +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] ENDFOR -dst[MAX:128] := 0 +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] - + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI + Arithmetic + + + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". -FOR j := 0 to 3 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := src.dword[j] - FI +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) ENDFOR -dst[MAX:128] := 0 +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI + Arithmetic + + + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". -FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] ENDFOR -dst[MAX:128] := 0 +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] - + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + + + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := 0 - FI +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) ENDFOR -dst[MAX:512] := 0 +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := src.dword[j] - FI +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) ENDFOR -dst[MAX:512] := 0 +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI Arithmetic - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + + + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". FOR j := 0 to 15 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + dst.fp16[j] := ABS(v2.fp16[j]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := 0 - FI + dst.fp16[j] := ABS(v2.fp16[j]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := src.dword[j] - FI + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) ENDFOR dst[MAX:256] := 0 - + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VNNI + Arithmetic + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". FOR j := 0 to 3 + i := j*32 IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) ELSE - dst.dword[j] := 0 + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". -FOR j := 0 to 3 +FOR j := 0 to 7 + i := j*32 IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) ELSE - dst.dword[j] := src.dword[j] + dst[i+31:i] := 0 FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512_VNNI + AVX512_FP16 AVX512VL +
immintrin.h
Arithmetic - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI ENDFOR dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VP2INTERSECT + AVX512_FP16 AVX512VL - Mask - - - - - - Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 3 - FOR j := 0 TO 3 - match := (a.dword[i] == b.dword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - -
immintrin.h
-
- - Integer - AVX512_VP2INTERSECT + + AVX512_FP16 AVX512VL - Mask - - - - - - Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 7 - FOR j := 0 TO 7 - match := (a.dword[i] == b.dword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR +k[MAX:8] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VP2INTERSECT - AVX512F - Mask - - - - - - Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+15:k1] := 0 -MEM[k2+15:k2] := 0 -FOR i := 0 TO 15 - FOR j := 0 TO 15 - match := (a.dword[i] == b.dword[j] ? 1 : 0) - MEM[k1+15:k1].bit[i] |= match - MEM[k2+15:k2].bit[j] |= match - ENDFOR + Compare + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 ENDFOR +k[MAX:16] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VP2INTERSECT + Compare + + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 AVX512VL - Mask - - - - - - Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. +
immintrin.h
+ Compare +
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 1 - FOR j := 0 TO 1 - match := (a.qword[i] == b.qword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) ENDFOR +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - AVX512_VP2INTERSECT + + AVX512_FP16 AVX512VL - Mask - - - - - - Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 3 - FOR j := 0 TO 3 - match := (a.qword[i] == b.qword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - AVX512_VP2INTERSECT - AVX512F - Mask - - - - - - Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + Convert + + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 7 - FOR j := 0 TO 7 - match := (a.qword[i] == b.qword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - - - Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + Convert + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". -tmp[511:0] := a -dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - - Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control". + Convert + + + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -start := control[7:0] -len := control[15:8] -tmp[511:0] := a -dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - - - Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + Convert + + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp[511:0] := a -dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - - Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control".. + Convert + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". -start := control[7:0] -len := control[15:8] -tmp[511:0] := a -dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + Convert + + + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst := (-a) AND a +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + Convert + + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst := (-a) AND a +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a". + Convert + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". -dst := (a - 1) XOR a +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a". + Convert + + + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst := (a - 1) XOR a +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + Convert + + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst := (a - 1) AND a +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + Convert + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. -dst := (a - 1) AND a +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - - Compute the bitwise NOT of 32-bit integer "a" and then AND with b, and store the results in dst. + Convert + + + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -dst[31:0] := ((NOT a[31:0]) AND b[31:0]) +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - - Compute the bitwise NOT of 64-bit integer "a" and then AND with b, and store the results in dst. + Convert + + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -dst[63:0] := ((NOT a[63:0]) AND b[63:0]) +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + Convert + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". -tmp := 0 -dst := 0 -DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + Convert + + + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp := 0 -dst := 0 -DO WHILE ((tmp < 64) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + Convert + + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp := 0 -dst := 0 -DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI1 - Bit Manipulation - - - Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + Convert + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. -tmp := 0 -dst := 0 -DO WHILE ((tmp < 64) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Bit Manipulation - - - - Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + Convert + + + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -n := index[7:0] -dst := a -IF (n < 32) - dst[31:n] := 0 -FI +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Bit Manipulation - - - - Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + Convert + + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -n := index[7:0] -dst := a -IF (n < 64) - dst[63:n] := 0 -FI +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Bit Manipulation - - - - Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + Convert + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 32 - IF mask[m] == 1 - dst[m] := tmp[k] - k := k + 1 - FI - m := m + 1 -OD +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Bit Manipulation - - - - Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + Convert + + + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 64 - IF mask[m] == 1 - dst[m] := tmp[k] - k := k + 1 +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] FI - m := m + 1 -OD +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Bit Manipulation - - - - Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + Convert + + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 32 - IF mask[m] == 1 - dst[k] := tmp[m] - k := k + 1 +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 FI - m := m + 1 -OD +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Bit Manipulation - - - - Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + Convert + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 64 - IF mask[m] == 1 - dst[k] := tmp[m] - k := k + 1 - FI - m := m + 1 -OD +FOR j := 0 TO 1 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:32] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Arithmetic - - - - - Multiply unsigned 32-bit integers "a" and "b", store the low 32-bits of the result in "dst", and store the high 32-bits in "hi". This does not read or write arithmetic flags. + Convert + + + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. -dst[31:0] := (a * b)[31:0] -MEM[hi+31:hi] := (a * b)[63:32] +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - BMI2 - Arithmetic - - - - - Multiply unsigned 64-bit integers "a" and "b", store the low 64-bits of the result in "dst", and store the high 64-bits in "hi". This does not read or write arithmetic flags. + Convert + + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. -dst[63:0] := (a * b)[63:0] -MEM[hi+63:hi] := (a * b)[127:64] +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - CET_SS - Miscellaneous - - - Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + Convert + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. -SSP := SSP + a[7:0] * 4 +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - CET_SS - Miscellaneous - - - Increment the shadow stack pointer by 8 times the value specified in bits [7:0] of "a". + Convert + + + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -SSP := SSP + a[7:0] * 8 +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - CET_SS - Miscellaneous - - - Read the low 32-bits of the current shadow stack pointer, and store the result in "dst". - dst := SSP[31:0] + Convert + + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - CET_SS - Miscellaneous - - - Read the current shadow stack pointer, and store the result in "dst". - dst := SSP[63:0] + Convert + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:32] := 0 - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - Save the previous shadow stack pointer context. - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - Restore the saved shadow stack pointer from the shadow stack restore token previously created on shadow stack by saveprevssp. - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - - Write 32-bit value in "val" to a shadow stack page in memory specified by "p". - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - - Write 64-bit value in "val" to a shadow stack page in memory specified by "p". - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - - Write 32-bit value in "val" to a user shadow stack page in memory specified by "p". - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - - Write 64-bit value in "val" to a user shadow stack page in memory specified by "p". - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - Mark shadow stack pointed to by IA32_PL0_SSP as busy. - -
immintrin.h
-
- - CET_SS - Miscellaneous - - - Mark shadow stack pointed to by "p" as not busy. - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - CET_SS - Miscellaneous - - - If CET is enabled, read the low 32-bits of the current shadow stack pointer, and store the result in "dst". Otherwise return 0. - dst := SSP[31:0] + Convert + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - CET_SS - Miscellaneous - - - If CET is enabled, read the current shadow stack pointer, and store the result in "dst". Otherwise return 0. - dst := SSP[63:0] + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - CET_SS - Miscellaneous - - - Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + Convert + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. -SSP := SSP + a[7:0] * 4 +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:64] := 0 - -
immintrin.h
-
- - CLDEMOTE - Miscellaneous - - - Hint to hardware that the cache line that contains "p" should be demoted from the cache closest to the processor core to a level more distant from the processor core. - -
immintrin.h
-
- - CLFLUSHOPT - General Support - - - Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. - -
immintrin.h
-
- - CLWB - General Support - - - Write back to memory the cache line that contains "p" from any level of the cache hierarchy in the cache coherence domain. - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + Convert + + + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + Convert + + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + Convert + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +FOR j := 0 TO 1 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:32] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + Convert + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:32] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + Convert + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. -dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + Convert + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. -dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 TO 3 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + Convert + + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -FOR j := 0 to 1 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + Convert + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -FOR j := 0 to 3 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + dst.fp16[j] := 0 FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. FOR j := 0 to 3 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -FOR j := 0 to 7 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +FOR j := 0 to 3 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +FOR j := 0 to 3 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:64] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + Convert + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +FOR j := 0 to 7 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + Convert + + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + Convert + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". -dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + dst.dword[j] := 0 FI ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". -FOR j := 0 to 3 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) ENDFOR dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst.dword[j] := src.dword[j] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + dst.dword[j] := 0 FI ENDFOR dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR dst[MAX:128] := 0 - - - -
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - - -
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -dst[127:64] := a[127:64] +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". -dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:128] := 0 +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:256] := 0 +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FMA - Arithmetic - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". -dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FP16C Convert - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI ENDFOR dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FP16C Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [sae_note] + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - FP16C Convert - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) ENDFOR dst[MAX:128] := 0 - -
emmintrin.h
-
- - Floating Point - FP16C + + AVX512_FP16 + AVX512VL +
immintrin.h
Convert - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [sae_note] +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI ENDFOR -dst[MAX:64] := 0 - - -
emmintrin.h
-
- - Integer - FSGSBASE - General Support - - Read the FS segment base register and store the 32-bit result in "dst". - dst[31:0] := FS_Segment_Base_Register -dst[63:32] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - FSGSBASE - General Support - - Read the FS segment base register and store the 64-bit result in "dst". - dst[63:0] := FS_Segment_Base_Register + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - FSGSBASE - General Support - - Read the GS segment base register and store the 32-bit result in "dst". - dst[31:0] := GS_Segment_Base_Register -dst[63:32] := 0 + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - FSGSBASE - General Support - - Read the GS segment base register and store the 64-bit result in "dst". - dst[63:0] := GS_Segment_Base_Register + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - FSGSBASE - General Support - - - Write the unsigned 32-bit integer "a" to the FS segment base register. + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FS_Segment_Base_Register[31:0] := a[31:0] -FS_Segment_Base_Register[63:32] := 0 +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - FSGSBASE - General Support - - - Write the unsigned 64-bit integer "a" to the FS segment base register. + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". -FS_Segment_Base_Register[63:0] := a[63:0] +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - FSGSBASE - General Support - - - Write the unsigned 32-bit integer "a" to the GS segment base register. + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -GS_Segment_Base_Register[31:0] := a[31:0] -GS_Segment_Base_Register[63:32] := 0 +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - FSGSBASE - General Support - - - Write the unsigned 64-bit integer "a" to the GS segment base register. + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -GS_Segment_Base_Register[63:0] := a[63:0] +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - FXSR - OS-Targeted - - - Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. - state_x87_fpu_mmx_sse := fxrstor(MEM[mem_addr+512*8:mem_addr]) + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - FXSR - OS-Targeted - - - Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. - state_x87_fpu_mmx_sse := fxrstor64(MEM[mem_addr+512*8:mem_addr]) + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - FXSR - OS-Targeted - - - Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. - MEM[mem_addr+512*8:mem_addr] := fxsave(state_x87_fpu_mmx_sse) + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - FXSR - OS-Targeted - - - Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. - MEM[mem_addr+512*8:mem_addr] := fxsave64(state_x87_fpu_mmx_sse) + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 63 +FOR j := 0 TO 1 IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) ELSE - dst.byte[j] := 0 + dst.qword[j] := src.qword[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 63 +FOR j := 0 TO 1 IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) ELSE - dst.byte[j] := src.byte[j] + dst.qword[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 63 - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 31 +FOR j := 0 TO 3 IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) ELSE - dst.byte[j] := 0 + dst.qword[j] := src.qword[j] FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 31 +FOR j := 0 TO 3 IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) ELSE - dst.byte[j] := src.byte[j] + dst.qword[j] := 0 FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 31 - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) ENDFOR -dst[MAX:256] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 15 +FOR j := 0 TO 1 IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) ELSE - dst.byte[j] := 0 + dst.qword[j] := src.qword[j] FI ENDFOR dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src"" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 15 +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) ELSE - dst.byte[j] := src.byte[j] + dst.qword[j] := src.qword[j] FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 15 - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} FOR j := 0 TO 3 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - GFNI + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) ENDFOR dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI ENDFOR dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[b] - FI - ENDFOR + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI - AVX512F - Arithmetic - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 TO 7 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) ENDFOR dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI ENDFOR dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) ENDFOR dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) ENDFOR dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI ENDFOR dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - GFNI + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 AVX512VL - Arithmetic - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI ENDFOR -dst[MAX:128] := 0 +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - INVPCID - OS-Targeted - - - - Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type". - The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved. - The types supported are: - 0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs. - 1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well. - 2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID. - 3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may also invalidate global translations as well. + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
-CASE type[1:0] OF -0: // individual-address invalidation retaining global translations - OP_PCID := MEM[descriptor+11:descriptor] - ADDR := MEM[descriptor+127:descriptor+64] - BREAK -1: // single PCID invalidation retaining globals - OP_PCID := MEM[descriptor+11:descriptor] - // invalidate all mappings tagged with OP_PCID except global translations - BREAK -2: // all PCID invalidation - // invalidate all mappings tagged with any PCID - BREAK -3: // all PCID invalidation retaining global translations - // invalidate all mappings tagged with any PCID except global translations - BREAK -ESAC +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - KNCNI - General Support - - - - Fetch the line of data from memory that contains address "p" to a location in the cache heirarchy specified by the locality hint "i". - - - - - - - - -
xmmintrin.h
-
- - Mask - KNCNI - Mask - - - - Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". -k[15:0] := (NOT a[15:0]) AND b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 1 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -k[15:0] := a[15:0] AND b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 1 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Mask - KNCNI - Mask - - - Copy 16-bit mask "a" to "k". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -k[15:0] := a[15:0] -k[MAX:16] := 0 +FOR j := 0 to 1 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Mask - KNCNI - Mask - - - Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". -k[15:0] := NOT a[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -k[15:0] := a[15:0] OR b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -k[15:0] := NOT (a[15:0] XOR b[15:0]) -k[MAX:16] := 0 +FOR j := 0 to 3 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -k[15:0] := a[15:0] XOR b[15:0] -k[MAX:16] := 0 +FOR j := 0 to 3 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - Mask - KNCNI - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - Mask - KNCNI - Compare - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 FI ENDFOR -k[MAX:16] := 0 +dst[MAX:128] := 0 - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] -FOR j := 0 to 15 - i := j*32 - CASE bc OF - _MM_BROADCAST32_NONE: - CASE conv OF - _MM_UPCONV_PS_NONE: - n := j*32 - dst[i+31:i] := addr[n+31:n] - _MM_UPCONV_PS_FLOAT16: - n := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_UINT8: - n := j*8 - dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_SINT8: - n := j*8 - dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_UINT16: - n := j*16 - dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_SINT16: - n := j*16 - dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n]) - ESAC - _MM_BROADCAST_1X16: - CASE conv OF - _MM_UPCONV_PS_NONE: - n := j*32 - dst[i+31:i] := addr[31:0] - _MM_UPCONV_PS_FLOAT16: - n := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(addr[15:0]) - _MM_UPCONV_PS_UINT8: - n := j*8 - dst[i+31:i] := Convert_UInt8_To_FP32(addr[7:0]) - _MM_UPCONV_PS_SINT8: - n := j*8 - dst[i+31:i] := Convert_Int8_To_FP32(addr[7:0]) - _MM_UPCONV_PS_UINT16: - n := j*16 - dst[i+31:i] := Convert_UInt16_To_FP32(addr[15:0]) - _MM_UPCONV_PS_SINT16: - n := j*16 - dst[i+31:i] := Convert_Int16_To_FP32(addr[15:0]) - ESAC - _MM_BROADCAST_4X16: - mod := j%4 - CASE conv OF - _MM_UPCONV_PS_NONE: - n := mod*32 - dst[i+31:i] := addr[n+31:n] - _MM_UPCONV_PS_FLOAT16: - n := mod*16 - dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_UINT8: - n := mod*8 - dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_SINT8: - n := mod*8 - dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_UINT16: - n := mod*16 - dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_SINT16: - n := mod*16 - 
dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n]) - ESAC - ESAC + Convert + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - - - Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] -FOR j := 0 to 15 - i := j*32 + Convert + + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 IF k[j] - CASE bc OF - _MM_BROADCAST32_NONE: - CASE conv OF - _MM_UPCONV_PS_NONE: - n := j*32 - dst[i+31:i] := addr[n+31:n] - _MM_UPCONV_PS_FLOAT16: - n := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_UINT8: - n := j*8 - dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_SINT8: - n := j*8 - dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_UINT16: - n := j*16 - dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_SINT16: - n := j*16 - dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n]) - ESAC - _MM_BROADCAST_1X16: - CASE conv OF - _MM_UPCONV_PS_NONE: - n := j*32 - dst[i+31:i] := addr[31:0] - _MM_UPCONV_PS_FLOAT16: - n := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(addr[15:0]) - _MM_UPCONV_PS_UINT8: - n := j*8 - dst[i+31:i] := Convert_UInt8_To_FP32(addr[7:0]) - _MM_UPCONV_PS_SINT8: - n := j*8 - dst[i+31:i] := Convert_Int8_To_FP32(addr[7:0]) - _MM_UPCONV_PS_UINT16: - n := j*16 - dst[i+31:i] := Convert_UInt16_To_FP32(addr[15:0]) - _MM_UPCONV_PS_SINT16: - n := j*16 - dst[i+31:i] := Convert_Int16_To_FP32(addr[15:0]) - ESAC - _MM_BROADCAST_4X16: - mod := j%4 - CASE conv OF - _MM_UPCONV_PS_NONE: - n := mod*32 - dst[i+31:i] := 
addr[n+31:n] - _MM_UPCONV_PS_FLOAT16: - n := mod*16 - dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_UINT8: - n := mod*8 - dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_SINT8: - n := mod*8 - dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) - _MM_UPCONV_PS_UINT16: - n := mod*16 - dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n]) - _MM_UPCONV_PS_SINT16: - n := mod*16 - dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n]) - ESAC - ESAC + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) ELSE - dst[i+31:i] := src[i+31:i] + dst.fp32[j] := src.fp32[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] -FOR j := 0 to 15 - i := j*32 - CASE bc OF - _MM_BROADCAST32_NONE: - CASE conv OF - _MM_UPCONV_EPI32_NONE: - n := j*32 - dst[i+31:i] := addr[n+31:n] - _MM_UPCONV_EPI32_UINT8: - n := j*8 - dst[i+31:i] := ZeroExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_SINT8: - n := j*8 - dst[i+31:i] := SignExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_UINT16: - n := j*16 - dst[i+31:i] := ZeroExtend32(addr[n+15:n]) - _MM_UPCONV_EPI32_SINT16: - n := j*16 - dst[i+31:i] := SignExtend32(addr[n+15:n]) - ESAC - _MM_BROADCAST_1X16: - CASE conv OF - _MM_UPCONV_EPI32_NONE: - n := j*32 - dst[i+31:i] := addr[31:0] - _MM_UPCONV_EPI32_UINT8: - n := j*8 - dst[i+31:i] := ZeroExtend32(addr[7:0]) - _MM_UPCONV_EPI32_SINT8: - n := j*8 - dst[i+31:i] := SignExtend32(addr[7:0]) - _MM_UPCONV_EPI32_UINT16: - n := j*16 - dst[i+31:i] := ZeroExtend32(addr[15:0]) - _MM_UPCONV_EPI32_SINT16: - n := j*16 - dst[i+31:i] := SignExtend32(addr[15:0]) - ESAC - _MM_BROADCAST_4X16: - mod := j%4 - CASE conv OF - _MM_UPCONV_EPI32_NONE: - n := mod*32 - dst[i+31:i] := addr[n+31:n] - _MM_UPCONV_EPI32_UINT8: - n := mod*8 - dst[i+31:i] := ZeroExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_SINT8: - n := mod*8 - dst[i+31:i] := SignExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_UINT16: - n := mod*16 - dst[i+31:i] := ZeroExtend32(addr[n+15:n]) - _MM_UPCONV_EPI32_SINT16: - n := mod*16 - dst[i+31:i] := SignExtend32(addr[n+15:n]) - ESAC - ESAC + Convert + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+ +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - - - Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] + Convert + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 - i := j*32 IF k[j] - CASE bc OF - _MM_BROADCAST32_NONE: - CASE conv OF - _MM_UPCONV_EPI32_NONE: - n := j*32 - dst[i+31:i] := addr[n+31:n] - _MM_UPCONV_EPI32_UINT8: - n := j*8 - dst[i+31:i] := ZeroExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_SINT8: - n := j*8 - dst[i+31:i] := SignExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_UINT16: - n := j*16 - dst[i+31:i] := ZeroExtend32(addr[n+15:n]) - _MM_UPCONV_EPI32_SINT16: - n := j*16 - dst[i+31:i] := SignExtend32(addr[n+15:n]) - ESAC - _MM_BROADCAST_1X16: - CASE conv OF - _MM_UPCONV_EPI32_NONE: - n := j*32 - dst[i+31:i] := addr[31:0] - _MM_UPCONV_EPI32_UINT8: - n := j*8 - dst[i+31:i] := ZeroExtend32(addr[7:0]) - _MM_UPCONV_EPI32_SINT8: - n := j*8 - dst[i+31:i] := SignExtend32(addr[7:0]) - _MM_UPCONV_EPI32_UINT16: - n := j*16 - dst[i+31:i] := ZeroExtend32(addr[15:0]) - _MM_UPCONV_EPI32_SINT16: - n := j*16 - dst[i+31:i] := SignExtend32(addr[15:0]) - ESAC - _MM_BROADCAST_4X16: - mod := j%4 - CASE conv OF - _MM_UPCONV_EPI32_NONE: - n := mod*32 - dst[i+31:i] := addr[n+31:n] - _MM_UPCONV_EPI32_UINT8: - n := mod*8 - dst[i+31:i] := ZeroExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_SINT8: - n := mod*8 - dst[i+31:i] := SignExtend32(addr[n+7:n]) - _MM_UPCONV_EPI32_UINT16: - n := mod*16 - dst[i+31:i] := ZeroExtend32(addr[n+15:n]) - _MM_UPCONV_EPI32_SINT16: - n := mod*16 - dst[i+31:i] := SignExtend32(addr[n+15:n]) - ESAC - ESAC + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) ELSE - dst[i+31:i] := src[i+31:i] + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] + Special Math Functions + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note] + +dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". + FOR j := 0 to 7 - i := j*64 - CASE bc OF - _MM_BROADCAST64_NONE: - CASE conv OF - _MM_UPCONV_PD_NONE: - n := j*64 - dst[i+63:i] := addr[n+63:n] - ESAC - _MM_BROADCAST_1X8: - CASE conv OF - _MM_UPCONV_PD_NONE: - n := j*64 - dst[i+63:i] := addr[63:0] - ESAC - _MM_BROADCAST_4X8: - mod := j%4 - CASE conv OF - _MM_UPCONV_PD_NONE: - n := mod*64 - dst[i+63:i] := addr[n+63:n] - ESAC - ESAC + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - - - Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] + Special Math Functions + + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 - i := j*64 IF k[j] - CASE bc OF - _MM_BROADCAST64_NONE: - CASE conv OF - _MM_UPCONV_PD_NONE: - n := j*64 - dst[i+63:i] := addr[n+63:n] - ESAC - _MM_BROADCAST_1X8: - CASE conv OF - _MM_UPCONV_PD_NONE: - n := j*64 - dst[i+63:i] := addr[63:0] - ESAC - _MM_BROADCAST_4X8: - mod := j%4 - CASE conv OF - _MM_UPCONV_PD_NONE: - n := mod*64 - dst[i+63:i] := addr[n+63:n] - ESAC - ESAC + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) ELSE - dst[i+63:i] := src[i+63:i] + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] + Special Math Functions + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 - i := j*64 - CASE bc OF - _MM_BROADCAST64_NONE: - CASE conv OF - _MM_UPCONV_EPI64_NONE: - n := j*64 - dst[i+63:i] := addr[n+63:n] - ESAC - _MM_BROADCAST_1X8: - CASE conv OF - _MM_UPCONV_EPI64_NONE: - n := j*64 - dst[i+63:i] := addr[63:0] - ESAC - _MM_BROADCAST_4X8: - mod := j%4 - CASE conv OF - _MM_UPCONV_EPI64_NONE: - n := mod*64 - dst[i+63:i] := addr[n+63:n] - ESAC - ESAC + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:128] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - - - Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] -FOR j := 0 to 7 - i := j*64 + Special Math Functions + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 IF k[j] - CASE bc OF - _MM_BROADCAST64_NONE: - CASE conv OF - _MM_UPCONV_EPI64_NONE: - n := j*64 - dst[i+63:i] := addr[n+63:n] - ESAC - _MM_BROADCAST_1X8: - CASE conv OF - _MM_UPCONV_EPI64_NONE: - n := j*64 - dst[i+63:i] := addr[63:0] - ESAC - _MM_BROADCAST_4X8: - mod := j%4 - CASE conv OF - _MM_UPCONV_EPI64_NONE: - n := mod*64 - dst[i+63:i] := addr[n+63:n] - ESAC - ESAC + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) ELSE - dst[i+63:i] := src[i+63:i] + dst.fp16[j] := src.fp16[j] FI ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - - - + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Swizzle - - - - Performs a swizzle transformation of each of the four groups of packed 4xsingle-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst". - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 7 - i := j*64 - dst[i+31:i] := v[i+63:i+32] - dst[i+63:i+32] := v[i+31:i] - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+95:i+64] - dst[i+63:i+32] := v[i+127:i+96] - dst[i+95:i+64] := v[i+31:i] - dst[i+127:i+96] := v[i+63:i+32] - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+31:i] - dst[i+63:i+32] := v[i+31:i] - dst[i+95:i+64] := v[i+31:i] - dst[i+127:i+96] := v[i+31:i] - ENDFOR -_MM_SWIZ_REG_BBBB: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+63:i+32] - dst[i+63:i+32] := v[i+63:i+32] - dst[i+95:i+64] := v[i+63:i+32] - dst[i+127:i+96] := v[i+63:i+32] - ENDFOR -_MM_SWIZ_REG_CCCC: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+95:i+64] - dst[i+63:i+32] := v[i+95:i+64] - dst[i+95:i+64] := v[i+95:i+64] - dst[i+127:i+96] := v[i+95:i+64] - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+127:i+96] - dst[i+63:i+32] := v[i+127:i+96] - dst[i+95:i+64] := v[i+127:i+96] - dst[i+127:i+96] := v[i+127:i+96] - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+63:i+32] - dst[i+63:i+32] := v[i+95:i+64] - dst[i+95:i+64] := v[i+31:i] - dst[i+127:i+96] := v[i+127:i+96] - ENDFOR -ESAC -dst[MAX:512] := 0 + Special Math Functions + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? 
a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Swizzle - - - - Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst". - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := v[i+127:i+64] - dst[i+127:i+64] := v[i+63:i] - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+191:i+128] - dst[i+127:i+64] := v[i+255:i+192] - dst[i+191:i+128] := v[i+63:i] - dst[i+255:i+192] := v[i+127:i+64] - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+63:i] - dst[i+127:i+64] := v[i+63:i] - dst[i+191:i+128] := v[i+63:i] - dst[i+255:i+192] := v[i+63:i] - ENDFOR -_MM_SWIZ_REG_BBBB: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+127:i+63] - dst[i+127:i+64] := v[i+127:i+63] - dst[i+191:i+128] := v[i+127:i+63] - dst[i+255:i+192] := v[i+127:i+63] - ENDFOR -_MM_SWIZ_REG_CCCC: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+191:i+128] - dst[i+127:i+64] := v[i+191:i+128] - dst[i+191:i+128] := v[i+191:i+128] - dst[i+255:i+192] := v[i+191:i+128] - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+255:i+192] - dst[i+127:i+64] := v[i+255:i+192] - dst[i+191:i+128] := v[i+255:i+192] - dst[i+255:i+192] := v[i+255:i+192] - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+127:i+64] - dst[i+127:i+64] := v[i+191:i+128] - dst[i+191:i+128] := v[i+63:i] - dst[i+255:i+192] := v[i+255:i+192] - ENDFOR -ESAC -dst[MAX:512] := 0 + Special Math Functions + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? 
a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Swizzle - - - - Performs a swizzle transformation of each of the four groups of packed 4x 32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst". - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 7 - i := j*64 - dst[i+31:i] := v[i+63:i+32] - dst[i+63:i+32] := v[i+31:i] - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+95:i+64] - dst[i+63:i+32] := v[i+127:i+96] - dst[i+95:i+64] := v[i+31:i] - dst[i+127:i+96] := v[i+63:i+32] - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+31:i] - dst[i+63:i+32] := v[i+31:i] - dst[i+95:i+64] := v[i+31:i] - dst[i+127:i+96] := v[i+31:i] - ENDFOR -_MM_SWIZ_REG_BBBB: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+63:i+32] - dst[i+63:i+32] := v[i+63:i+32] - dst[i+95:i+64] := v[i+63:i+32] - dst[i+127:i+96] := v[i+63:i+32] - ENDFOR -_MM_SWIZ_REG_CCCC: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+95:i+64] - dst[i+63:i+32] := v[i+95:i+64] - dst[i+95:i+64] := v[i+95:i+64] - dst[i+127:i+96] := v[i+95:i+64] - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+127:i+96] - dst[i+63:i+32] := v[i+127:i+96] - dst[i+95:i+64] := v[i+127:i+96] - dst[i+127:i+96] := v[i+127:i+96] - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 3 - i := j*128 - dst[i+31:i] := v[i+63:i+32] - dst[i+63:i+32] := v[i+95:i+64] - dst[i+95:i+64] := v[i+31:i] - dst[i+127:i+96] := v[i+127:i+96] - ENDFOR -ESAC -dst[MAX:512] := 0 + Special Math Functions + + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? 
a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Swizzle - - - - Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst". - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := v[i+127:i+64] - dst[i+127:i+64] := v[i+63:i] - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+191:i+128] - dst[i+127:i+64] := v[i+255:i+192] - dst[i+191:i+128] := v[i+63:i] - dst[i+255:i+192] := v[i+127:i+64] - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+63:i] - dst[i+127:i+64] := v[i+63:i] - dst[i+191:i+128] := v[i+63:i] - dst[i+255:i+192] := v[i+63:i] - ENDFOR -_MM_SWIZ_REG_BBBB: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+127:i+63] - dst[i+127:i+64] := v[i+127:i+63] - dst[i+191:i+128] := v[i+127:i+63] - dst[i+255:i+192] := v[i+127:i+63] - ENDFOR -_MM_SWIZ_REG_CCCC: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+191:i+128] - dst[i+127:i+64] := v[i+191:i+128] - dst[i+191:i+128] := v[i+191:i+128] - dst[i+255:i+192] := v[i+191:i+128] - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+255:i+192] - dst[i+127:i+64] := v[i+255:i+192] - dst[i+191:i+128] := v[i+255:i+192] - dst[i+255:i+192] := v[i+255:i+192] - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 1 - i := j*256 - dst[i+63:i] := v[i+127:i+64] - dst[i+127:i+64] := v[i+191:i+128] - dst[i+191:i+128] := v[i+63:i] - dst[i+255:i+192] := v[i+255:i+192] - ENDFOR -ESAC -dst[MAX:512] := 0 + Special Math Functions + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". 
+ +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Swizzle - - - - - - Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 7 - i := j*64 - IF k[j*2] - dst[i+31:i] := v[i+63:i+32] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*2+1] - dst[i+63:i+32] := v[i+31:i] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+95:i+64] - ELSE - dst[i+31:i] := src[i+31:i] + Special Math Functions + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note] + +dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dest[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dest[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dest[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dest[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dest[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dest[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+127:i+96] - ELSE - dst[i+63:i+32] := src[i+63:i+32] + IF denormal2 + tmp2 := 0 FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+31:i] - ELSE - dst[i+95:i+64] := src[i+95:i+64] + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+63:i+32] - ELSE - dst[i+127:i+96] := src[i+127:i+96] + IF denormal2 + tmp2 := 0 FI - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+31:i] - ELSE - dst[i+63:i+32] := src[i+63:i+32] + IF denormal2 + tmp2 := 0 FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+31:i] - ELSE - dst[i+95:i+64] := src[i+95:i+64] + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+31:i] - ELSE - dst[i+127:i+96] := src[i+127:i+96] + IF denormal2 + tmp2 := 0 FI - ENDFOR -_MM_SWIZ_REG_BBBB: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+63:i+32] - ELSE - dst[i+31:i] := src[i+31:i] + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+63:i+32] - ELSE - dst[i+63:i+32] := src[i+63:i+32] + IF denormal2 + tmp2 := 0 FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+63:i+32] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+63:i+32] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_CCCC: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+95:i+64] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+95:i+64] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+95:i+64] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+95:i+64] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+127:i+96] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+127:i+96] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+127:i+96] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+127:i+96] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+63:i+32] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+95:i+64] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+31:i] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+127:i+96] - ELSE 
- dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -ESAC -dst[MAX:512] := 0 + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Floating Point - KNCNI - Swizzle - - - - - - Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 3 - i := j*64 - IF k[j*2] - dst[i+63:i] := v[i+127:i+64] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*2+1] - dst[i+127:i+64] := v[i+63:i] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+191:i+128] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+255:i+192] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+63:i] - ELSE - dst[i+191:i+128] := src[i+191:i+128] - FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+127:i+64] - ELSE - dst[i+255:i+192] := src[i+255:i+192] - FI - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+63:i] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+63:i] - ELSE - dst[i+191:i+128] := src[i+191:i+128] - FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+63:i] - ELSE - dst[i+255:i+192] := src[i+255:i+192] - FI - ENDFOR -_MM_SWIZ_REG_BBBB: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+127:i+63] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+127:i+63] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+127:i+63] - ELSE - dst[i+191:i+128] := src[i+191:i+128] - FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+127:i+63] - ELSE - dst[i+255:i+192] := src[i+255:i+192] - FI - ENDFOR 
-_MM_SWIZ_REG_CCCC: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+191:i+128] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+191:i+128] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+191:i+128] - ELSE - dst[i+191:i+128] := src[i+191:i+128] - FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+191:i+128] - ELSE - dst[i+255:i+192] := src[i+255:i+192] - FI - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+255:i+192] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+255:i+192] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+255:i+192] - ELSE - dst[i+191:i+128] := src[i+191:i+128] - FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+255:i+192] - ELSE - dst[i+255:i+192] := src[i+255:i+192] - FI - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+127:i+64] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+191:i+128] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+63:i] - ELSE - dst[i+191:i+128] := src[i+191:i+128] + Miscellaneous + + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+ DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+255:i+192] - ELSE - dst[i+255:i+192] := src[i+255:i+192] + IF denormal2 + tmp2 := 0 FI - ENDFOR -ESAC -dst[MAX:512] := 0 + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Swizzle - - - - - - Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 7 - i := j*64 - IF k[j*2] - dst[i+31:i] := v[i+63:i+32] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*2+1] - dst[i+63:i+32] := v[i+31:i] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+95:i+64] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+127:i+96] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+31:i] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+63:i+32] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+31:i] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+31:i] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+31:i] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_BBBB: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+63:i+32] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+63:i+32] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+63:i+32] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+63:i+32] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_CCCC: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+95:i+64] 
- ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+95:i+64] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+95:i+64] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+95:i+64] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+127:i+96] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+127:i+96] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+127:i+96] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+127:i+96] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 3 - i := j*128 - IF k[j*4] - dst[i+31:i] := v[i+63:i+32] - ELSE - dst[i+31:i] := src[i+31:i] - FI - IF k[j*4+1] - dst[i+63:i+32] := v[i+95:i+64] - ELSE - dst[i+63:i+32] := src[i+63:i+32] - FI - IF k[j*4+2] - dst[i+95:i+64] := v[i+31:i] - ELSE - dst[i+95:i+64] := src[i+95:i+64] - FI - IF k[j*4+3] - dst[i+127:i+96] := v[i+127:i+96] - ELSE - dst[i+127:i+96] := src[i+127:i+96] - FI - ENDFOR -ESAC -dst[MAX:512] := 0 + Miscellaneous + + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 7 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + AVX512_FP16 + AVX512VL
immintrin.h
-
- - Integer - KNCNI - Swizzle - - - - - - Performs a swizzle transformation of each of the four groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - CASE s OF -_MM_SWIZ_REG_NONE: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_DCBA: - dst[511:0] := v[511:0] -_MM_SWIZ_REG_CDAB: - FOR j := 0 to 3 - i := j*64 - IF k[j*2] - dst[i+63:i] := v[i+127:i+64] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*2+1] - dst[i+127:i+64] := v[i+63:i] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - ENDFOR -_MM_SWIZ_REG_BADC: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+191:i+128] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+255:i+192] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+63:i] - ELSE - dst[i+191:i+128] := src[i+191:i+128] - FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+127:i+64] - ELSE - dst[i+255:i+192] := src[i+255:i+192] - FI - ENDFOR -_MM_SWIZ_REG_AAAA: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+63:i] - ELSE - dst[i+127:i+64] := src[i+127:i+64] - FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+63:i] - ELSE - dst[i+191:i+128] := src[i+191:i+128] - FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+63:i] - ELSE - dst[i+255:i+192] := src[i+255:i+192] - FI - ENDFOR + Miscellaneous + + + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 7 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 15 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 15 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + off := idx[i+2:i] + dst.fp16[j] := idx[i+3] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + off := idx[i+3:i] + dst.fp16[j] := idx[i+4] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 7 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 15 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + + Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + Return vector of type __m256h with undefined elements. + AVX512_FP16 + AVX512VL +
immintrin.h
+ General Support +
+ + + + Return vector of type __m128h with undefined elements. + AVX512_FP16 + AVX512VL +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Set +
+ + + + Return vector of type __m128h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Set +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] + b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] + b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] / b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] / b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] - b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] - b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 31 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR i := 0 TO 31 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] * b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] * b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := tmp.fp16[i] + a.fp16[i+16] +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := tmp.fp16[i] * a.fp16[i+16] +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := (a.fp16[i] > a.fp16[i+16] ? a.fp16[i] : a.fp16[i+16]) +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := (a.fp16[i] < a.fp16[i+16] ? tmp.fp16[i] : a.fp16[i+16]) +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := ABS(v2.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[3:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). + +RETURN ( a.fp16[0] == b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). + +RETURN ( a.fp16[0] < b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + +RETURN ( a.fp16[0] <= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). + +RETURN ( a.fp16[0] > b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + +RETURN ( a.fp16[0] >= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). + +RETURN ( a.fp16[0] != b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + +RETURN ( a.fp16[0] == b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + +RETURN ( a.fp16[0] < b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + +RETURN ( a.fp16[0] <= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + +RETURN ( a.fp16[0] > b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + +RETURN ( a.fp16[0] >= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + +RETURN ( a.fp16[0] != b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper element of "dst". + +dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper element of "dst". + [round_note] + +dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note] + +dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := src.fp64[0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := src.fp64[0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := src.fp32[0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := src.fp32[0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst.dword := Convert_FP16_To_Int32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst.dword := Convert_FP16_To_Int32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst.qword := Convert_FP16_To_Int64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst.qword := Convert_FP16_To_Int64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + +dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". [round_note] + +dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy 16-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower 16-bit integer in "a" to "dst". + +dst.fp16[0] := a.fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst", and zero the upper elements. + +dst.fp16[0] := MEM[mem_addr].fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. + +IF k[0] + dst.fp16[0] := MEM[mem_addr].fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. + +IF k[0] + dst.fp16[0] := MEM[mem_addr].fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + Store the lower half-precision (16-bit) floating-point element from "a" into memory. + +MEM[mem_addr].fp16[0] := a.fp16[0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + + Store the lower half-precision (16-bit) floating-point element from "a" into memory using writemask "k". + +IF k[0] + MEM[mem_addr].fp16[0] := a.fp16[0] +FI + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dest[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dest[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dest[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dest[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dest[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dest[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dest[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dest[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dest[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dest[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dest[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dest[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissa of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissa of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissa of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissa of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissa of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissa of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 31 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 31 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". + [fpclass_note] + k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + [fpclass_note] + IF k1[0] + k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + off := idx[i+4:i] + dst.fp16[j] := idx[i+5] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 31 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR i := 0 to 31 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := SQRT(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := SQRT(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst.fp16[0] := (1.0 / b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 +dst.fp16[8] := e8 +dst.fp16[9] := e9 +dst.fp16[10] := e10 +dst.fp16[11] := e11 +dst.fp16[12] := e12 +dst.fp16[13] := e13 +dst.fp16[14] := e14 +dst.fp16[15] := e15 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 +dst.fp16[8] := e8 +dst.fp16[9] := e9 +dst.fp16[10] := e10 +dst.fp16[11] := e11 +dst.fp16[12] := e12 +dst.fp16[13] := e13 +dst.fp16[14] := e14 +dst.fp16[15] := e15 +dst.fp16[16] := e16 +dst.fp16[17] := e17 +dst.fp16[18] := e18 +dst.fp16[19] := e19 +dst.fp16[20] := e20 +dst.fp16[21] := e21 +dst.fp16[22] := e22 +dst.fp16[23] := e23 +dst.fp16[24] := e24 +dst.fp16[25] := e25 +dst.fp16[26] := e26 +dst.fp16[27] := e27 +dst.fp16[28] := e28 +dst.fp16[29] := e29 +dst.fp16[30] := e30 +dst.fp16[31] := e31 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e7 +dst.fp16[1] := e6 +dst.fp16[2] := e5 +dst.fp16[3] := e4 +dst.fp16[4] := e3 +dst.fp16[5] := e2 +dst.fp16[6] := e1 +dst.fp16[7] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e15 +dst.fp16[1] := e14 +dst.fp16[2] := e13 +dst.fp16[3] := e12 +dst.fp16[4] := e11 +dst.fp16[5] := e10 +dst.fp16[6] := e9 +dst.fp16[7] := e8 +dst.fp16[8] := e7 +dst.fp16[9] := e6 +dst.fp16[10] := e5 +dst.fp16[11] := e4 +dst.fp16[12] := e3 +dst.fp16[13] := e2 +dst.fp16[14] := e1 +dst.fp16[15] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e31 +dst.fp16[1] := e30 +dst.fp16[2] := e29 +dst.fp16[3] := e28 +dst.fp16[4] := e27 +dst.fp16[5] := e26 +dst.fp16[6] := e25 +dst.fp16[7] := e24 +dst.fp16[8] := e23 +dst.fp16[9] := e22 +dst.fp16[10] := e21 +dst.fp16[11] := e20 +dst.fp16[12] := e19 +dst.fp16[13] := e18 +dst.fp16[14] := e17 +dst.fp16[15] := e16 +dst.fp16[16] := e15 +dst.fp16[17] := e14 +dst.fp16[18] := e13 +dst.fp16[19] := e12 +dst.fp16[20] := e11 +dst.fp16[21] := e10 +dst.fp16[22] := e9 +dst.fp16[23] := e8 +dst.fp16[24] := e7 +dst.fp16[25] := e6 +dst.fp16[26] := e5 +dst.fp16[27] := e4 +dst.fp16[28] := e3 +dst.fp16[29] := e2 +dst.fp16[30] := e1 +dst.fp16[31] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 7 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 15 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 31 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Copy half-precision (16-bit) floating-point element "a" to the lower element of "dst", and zero the upper 7 elements. + +dst.fp16[0] := a[15:0] +dst[127:16] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + Return vector of type __m512h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Cast vector of type "__m128h" to type "__m128". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m128d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m128i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128d" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256d" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512d" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128i" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256i" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512i" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m256h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m512h"; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m512h"; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + Return vector of type __m512h with undefined elements. + AVX512_FP16 +
immintrin.h
+ General Support +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 15 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 7 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 31 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 15 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 31 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 63 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 +
immintrin.h
+ Store +
+ + + + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+15:k1] := 0 +MEM[k2+15:k2] := 0 +FOR i := 0 TO 15 + FOR j := 0 TO 15 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+15:k1].bit[i] |= match + MEM[k2+15:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512F +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 7 + FOR j := 0 TO 7 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512F +
immintrin.h
+ Mask +
+ + + + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 3 + FOR j := 0 TO 3 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 7 + FOR j := 0 TO 7 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 1 + FOR j := 0 TO 1 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 3 + FOR j := 0 TO 3 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + +tmp[511:0] := a +dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control". + +start := control[7:0] +len := control[15:8] +tmp[511:0] := a +dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + + Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + +tmp[511:0] := a +dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control".. + +start := control[7:0] +len := control[15:8] +tmp[511:0] := a +dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + +dst := (-a) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + +dst := (-a) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a". + +dst := (a - 1) XOR a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a". + +dst := (a - 1) XOR a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + +dst := (a - 1) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + +dst := (a - 1) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Compute the bitwise NOT of 32-bit integer "a" and then AND with b, and store the results in dst. + +dst[31:0] := ((NOT a[31:0]) AND b[31:0]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Compute the bitwise NOT of 64-bit integer "a" and then AND with b, and store the results in dst. + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + + + Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + +n := index[7:0] +dst := a +IF (n < 32) + dst[31:n] := 0 +FI + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + +n := index[7:0] +dst := a +IF (n < 64) + dst[63:n] := 0 +FI + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 32 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 64 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 32 + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 64 + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + + Multiply unsigned 32-bit integers "a" and "b", store the low 32-bits of the result in "dst", and store the high 32-bits in "hi". This does not read or write arithmetic flags. + +dst[31:0] := (a * b)[31:0] +MEM[hi+31:hi] := (a * b)[63:32] + + + BMI2 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply unsigned 64-bit integers "a" and "b", store the low 64-bits of the result in "dst", and store the high 64-bits in "hi". This does not read or write arithmetic flags. + +dst[63:0] := (a * b)[63:0] +MEM[hi+63:hi] := (a * b)[127:64] + + + BMI2 +
immintrin.h
+ Arithmetic +
+ + + + + + Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 4 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Increment the shadow stack pointer by 8 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 8 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Read the low 32-bits of the current shadow stack pointer, and store the result in "dst". + dst := SSP[31:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Read the current shadow stack pointer, and store the result in "dst". + dst := SSP[63:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Save the previous shadow stack pointer context. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Restore the saved shadow stack pointer from the shadow stack restore token previously created on shadow stack by saveprevssp. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 32-bit value in "val" to a shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 64-bit value in "val" to a shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 32-bit value in "val" to a user shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 64-bit value in "val" to a user shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Mark shadow stack pointed to by IA32_PL0_SSP as busy. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Mark shadow stack pointed to by "p" as not busy. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + If CET is enabled, read the low 32-bits of the current shadow stack pointer, and store the result in "dst". Otherwise return 0. + dst := SSP[31:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + If CET is enabled, read the current shadow stack pointer, and store the result in "dst". Otherwise return 0. + dst := SSP[63:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 4 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + + Hint to hardware that the cache line that contains "p" should be demoted from the cache closest to the processor core to a level more distant from the processor core. + + CLDEMOTE +
immintrin.h
+ Miscellaneous +
+ + + + + + Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. + + CLFLUSHOPT +
immintrin.h
+ General Support +
+ + + + + + Write back to memory the cache line that contains "p" from any level of the cache hierarchy in the cache coherence domain. + + CLWB +
immintrin.h
+ General Support +
+ + + + + + + Reads 64-byte command pointed by "__src", formats 64-byte enqueue store data, and performs 64-byte enqueue store to memory pointed by "__dst". This intrinsics may only be used in User mode. + + ENQCMD +
immintrin.h
+ Unknown +
+ + + + + Reads 64-byte command pointed by "__src", formats 64-byte enqueue store data, and performs 64-byte enqueue store to memory pointed by "__dst" This intrinsic may only be used in Privileged mode. + + ENQCMD +
immintrin.h
+ Unknown +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:256] := 0 + + + FP16C +
emmintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [sae_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + FP16C +
emmintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:128] := 0 + + + FP16C +
emmintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [sae_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:64] := 0 + + + FP16C +
emmintrin.h
+ Convert +
+ + + + + Read the FS segment base register and store the 32-bit result in "dst". + dst[31:0] := FS_Segment_Base_Register +dst[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the FS segment base register and store the 64-bit result in "dst". + dst[63:0] := FS_Segment_Base_Register + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the GS segment base register and store the 32-bit result in "dst". + dst[31:0] := GS_Segment_Base_Register +dst[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the GS segment base register and store the 64-bit result in "dst". + dst[63:0] := GS_Segment_Base_Register + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 32-bit integer "a" to the FS segment base register. + +FS_Segment_Base_Register[31:0] := a[31:0] +FS_Segment_Base_Register[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 64-bit integer "a" to the FS segment base register. + +FS_Segment_Base_Register[63:0] := a[63:0] + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 32-bit integer "a" to the GS segment base register. + +GS_Segment_Base_Register[31:0] := a[31:0] +GS_Segment_Base_Register[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 64-bit integer "a" to the GS segment base register. + +GS_Segment_Base_Register[63:0] := a[63:0] + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + + + Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. + state_x87_fpu_mmx_sse := fxrstor(MEM[mem_addr+512*8:mem_addr]) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. + state_x87_fpu_mmx_sse := fxrstor64(MEM[mem_addr+512*8:mem_addr]) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. + MEM[mem_addr+512*8:mem_addr] := fxsave(state_x87_fpu_mmx_sse) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. + MEM[mem_addr+512*8:mem_addr] := fxsave64(state_x87_fpu_mmx_sse) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Provides a hint to the processor to selectively reset the prediction history of the current logical processor specified by a signed 32-bit integer "__eax". + + HRESET +
immintrin.h
+ General Support +
+ + + + + + Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type". + The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved. + The types supported are: + 0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs. + 1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well. + 2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID. + 3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may also invalidate global translations as well. 
+ +CASE type[1:0] OF +0: // individual-address invalidation retaining global translations + OP_PCID := MEM[descriptor+11:descriptor] + ADDR := MEM[descriptor+127:descriptor+64] + BREAK +1: // single PCID invalidation retaining globals + OP_PCID := MEM[descriptor+11:descriptor] + // invalidate all mappings tagged with OP_PCID except global translations + BREAK +2: // all PCID invalidation + // invalidate all mappings tagged with any PCID + BREAK +3: // all PCID invalidation retaining global translations + // invalidate all mappings tagged with any PCID except global translations + BREAK +ESAC + + + INVPCID +
immintrin.h
+ OS-Targeted +
+ + + + Flag + + + + + Decrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], __h[383:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 14 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], __h[511:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. + MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], __h[383:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 14 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], __h[511:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Wrap a 128-bit AES key from "__key" into a 384-bit key stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The explicit source operand "__htype" specifies the restrictions on "__h". + __h[383:0] := WrapKey128(__key[127:0], __htype) +dst[0] := IWKey.NoBackup +dst[4:1] := IWKey.KeySource[3:0] + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + + Wrap a 256-bit AES key from "__key_hi" and "__key_lo" into a 512-bit key stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The 32-bit "__htype" specifies the restrictions on "__h". + __h[511:0] := WrapKey256(__key_lo[127:0], __key_hi[127:0], __htype) +dst[0] := IWKey.NoBackup +dst[4:1] := IWKey.KeySource[3:0] + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + + Load internal wrapping key (IWKey). The 32-bit unsigned integer "__ctl" specifies IWKey's KeySource and whether backing up the key is permitted. IWKey's 256-bit encryption key is loaded from "__enkey_lo" and "__enkey_hi". IWKey's 128-bit integrity key is loaded from "__intkey". + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + + Flag + + + + + Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES128Decrypt (__idata[i], __h[383:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 14 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES256Decrypt (__idata[i], __h[511:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES128Encrypt (__idata[i], __h[383:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 14 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES256Encrypt (__idata[i], __h[511:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + + + + + Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i". + + + + + + + + + KNCNI +
immintrin.h
+ General Support +
+ + + + Stalls a thread without blocking other threads for 32-bit unsigned integer "r1" clock cycles. + BlockThread(r1) + + + KNCNI +
immintrin.h
+ General Support +
+ + + + Stalls a thread without blocking other threads for 64-bit unsigned integer "r1" clock cycles. + BlockThread(r1) + + + KNCNI +
immintrin.h
+ General Support +
+ + + + Set performance monitoring filtering mask to 32-bit unsigned integer "r1". + SetPerfMonMask(r1[31:0]) + + + KNCNI +
immintrin.h
+ General Support +
+ + + + Set performance monitoring filtering mask to 64-bit unsigned integer "r1". + SetPerfMonMask(r1[63:0]) + + + KNCNI +
immintrin.h
+ General Support +
+ + + + + Evicts the cache line containing the address "ptr" from cache level "level" (can be either 0 or 1). + CacheLineEvict(ptr, level) + + + + KNCNI +
immintrin.h
+ General Support +
+ + + + + Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + +k[15:0] := (NOT a[15:0]) AND b[15:0] +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] AND b[15:0] +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + Copy 16-bit mask "a" to "k". + +k[15:0] := a[15:0] +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". + +k[15:0] := NOT a[15:0] +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] OR b[15:0] +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := NOT (a[15:0] XOR b[15:0]) +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] XOR b[15:0] +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Inserts the low byte of mask "k2" into the high byte of "dst", and copies the low byte of "k1" to the low byte of "dst". + +dst[7:0] := k1[7:0] +dst[15:8] := k2[7:0] + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Performs a bitwise AND operation between NOT of "k2" and "k1", storing the result in "dst". + dst[15:0] := NOT(k2[15:0]) & k1[15:0] + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Moves high byte from "k2" to low byte of "k1", and moves low byte of "k2" to high byte of "k1". + +tmp[7:0] := k2[15:8] +k2[15:8] := k1[7:0] +k1[7:0] := tmp[7:0] +tmp[7:0] := k2[7:0] +k2[7:0] := k1[15:8] +k1[15:8] := tmp[7:0] + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0. + dst[15:0] := k1[15:0] | k2[15:0] +IF dst == 0 + SetZF() +FI + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's. + dst[15:0] := k1[15:0] | k2[15:0] +IF PopCount(dst[15:0]) == 16 + SetCF() +FI + + + KNCNI +
immintrin.h
+ Mask +
+ + + + Converts bit mask "k1" into an integer value, storing the results in "dst". + +dst := ZeroExtend32(k1) + + + KNCNI +
immintrin.h
+ Mask +
+ + + + Converts integer "mask" into bitmask, storing the result in "dst". + +dst := mask[15:0] + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Packs masks "k1" and "k2" into the high 32 bits of "dst". The rest of "dst" is set to 0. + +dst[63:48] := k1[15:0] +dst[47:32] := k2[15:0] +dst[31:0] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Packs masks "k1" and "k2" into the low 32 bits of "dst". The rest of "dst" is set to 0. + +dst[31:16] := k1[15:0] +dst[15:0] := k2[15:0] +dst[63:32] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Extracts the 16-bit value selected by "b" from 64-bit integer "a", storing the result in "dst". + +CASE b[1:0] OF +0: dst[15:0] := a[63:48] +1: dst[15:0] := a[47:32] +2: dst[15:0] := a[31:16] +3: dst[15:0] := a[15:0] +ESAC +dst[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Move the high element from "k1" to the low element of "k1", and insert the low element of "k2" into the high element of "k1". + +tmp[7:0] := k1[15:8] +k1[15:8] := k2[7:0] +k1[7:0] := tmp[7:0] + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Insert the low element of "k2" into the high element of "k1". + +k1[15:8] := k2[7:0] + + + KNCNI +
immintrin.h
+ Mask +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + KNCNI +
immintrin.h
+ Compare +
+ + + + + + + Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + CASE bc OF + _MM_BROADCAST32_NONE: + CASE conv OF + _MM_UPCONV_PS_NONE: + n := j*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_PS_FLOAT16: + n := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_UINT8: + n := j*8 + dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_SINT8: + n := j*8 + dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_UINT16: + n := j*16 + dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_SINT16: + n := j*16 + dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n]) + ESAC + _MM_BROADCAST_1X16: + CASE conv OF + _MM_UPCONV_PS_NONE: + n := j*32 + dst[i+31:i] := addr[31:0] + _MM_UPCONV_PS_FLOAT16: + n := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(addr[15:0]) + _MM_UPCONV_PS_UINT8: + n := j*8 + dst[i+31:i] := Convert_UInt8_To_FP32(addr[7:0]) + _MM_UPCONV_PS_SINT8: + n := j*8 + dst[i+31:i] := Convert_Int8_To_FP32(addr[7:0]) + _MM_UPCONV_PS_UINT16: + n := j*16 + dst[i+31:i] := Convert_UInt16_To_FP32(addr[15:0]) + _MM_UPCONV_PS_SINT16: + n := j*16 + dst[i+31:i] := Convert_Int16_To_FP32(addr[15:0]) + ESAC + _MM_BROADCAST_4X16: + mod := j%4 + CASE conv OF + _MM_UPCONV_PS_NONE: + n := mod*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_PS_FLOAT16: + n := mod*16 + dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_UINT8: + n := mod*8 + dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_SINT8: + n := mod*8 + dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_UINT16: + n := mod*16 + dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_SINT16: + n := mod*16 + dst[i+31:i] := 
Convert_Int16_To_FP32(addr[n+15:n]) + ESAC + ESAC +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + IF k[j] + CASE bc OF + _MM_BROADCAST32_NONE: + CASE conv OF + _MM_UPCONV_PS_NONE: + n := j*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_PS_FLOAT16: + n := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_UINT8: + n := j*8 + dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_SINT8: + n := j*8 + dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_UINT16: + n := j*16 + dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_SINT16: + n := j*16 + dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n]) + ESAC + _MM_BROADCAST_1X16: + CASE conv OF + _MM_UPCONV_PS_NONE: + n := j*32 + dst[i+31:i] := addr[31:0] + _MM_UPCONV_PS_FLOAT16: + n := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(addr[15:0]) + _MM_UPCONV_PS_UINT8: + n := j*8 + dst[i+31:i] := Convert_UInt8_To_FP32(addr[7:0]) + _MM_UPCONV_PS_SINT8: + n := j*8 + dst[i+31:i] := Convert_Int8_To_FP32(addr[7:0]) + _MM_UPCONV_PS_UINT16: + n := j*16 + dst[i+31:i] := Convert_UInt16_To_FP32(addr[15:0]) + _MM_UPCONV_PS_SINT16: + n := j*16 + dst[i+31:i] := Convert_Int16_To_FP32(addr[15:0]) + ESAC + _MM_BROADCAST_4X16: + mod := j%4 + CASE conv OF + _MM_UPCONV_PS_NONE: + n := mod*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_PS_FLOAT16: + n := mod*16 + dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_UINT8: + n := mod*8 + dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_SINT8: + n := mod*8 + dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n]) + _MM_UPCONV_PS_UINT16: + n := mod*16 + dst[i+31:i] := 
Convert_UInt16_To_FP32(addr[n+15:n]) + _MM_UPCONV_PS_SINT16: + n := mod*16 + dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n]) + ESAC + ESAC + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + CASE bc OF + _MM_BROADCAST32_NONE: + CASE conv OF + _MM_UPCONV_EPI32_NONE: + n := j*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_EPI32_UINT8: + n := j*8 + dst[i+31:i] := ZeroExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_SINT8: + n := j*8 + dst[i+31:i] := SignExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_UINT16: + n := j*16 + dst[i+31:i] := ZeroExtend32(addr[n+15:n]) + _MM_UPCONV_EPI32_SINT16: + n := j*16 + dst[i+31:i] := SignExtend32(addr[n+15:n]) + ESAC + _MM_BROADCAST_1X16: + CASE conv OF + _MM_UPCONV_EPI32_NONE: + n := j*32 + dst[i+31:i] := addr[31:0] + _MM_UPCONV_EPI32_UINT8: + n := j*8 + dst[i+31:i] := ZeroExtend32(addr[7:0]) + _MM_UPCONV_EPI32_SINT8: + n := j*8 + dst[i+31:i] := SignExtend32(addr[7:0]) + _MM_UPCONV_EPI32_UINT16: + n := j*16 + dst[i+31:i] := ZeroExtend32(addr[15:0]) + _MM_UPCONV_EPI32_SINT16: + n := j*16 + dst[i+31:i] := SignExtend32(addr[15:0]) + ESAC + _MM_BROADCAST_4X16: + mod := j%4 + CASE conv OF + _MM_UPCONV_EPI32_NONE: + n := mod*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_EPI32_UINT8: + n := mod*8 + dst[i+31:i] := ZeroExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_SINT8: + n := mod*8 + dst[i+31:i] := SignExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_UINT16: + n := mod*16 + dst[i+31:i] := ZeroExtend32(addr[n+15:n]) + _MM_UPCONV_EPI32_SINT16: + n := mod*16 + dst[i+31:i] := SignExtend32(addr[n+15:n]) + ESAC + ESAC +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + IF k[j] + CASE bc OF + _MM_BROADCAST32_NONE: + CASE conv OF + _MM_UPCONV_EPI32_NONE: + n := j*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_EPI32_UINT8: + n := j*8 + dst[i+31:i] := ZeroExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_SINT8: + n := j*8 + dst[i+31:i] := SignExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_UINT16: + n := j*16 + dst[i+31:i] := ZeroExtend32(addr[n+15:n]) + _MM_UPCONV_EPI32_SINT16: + n := j*16 + dst[i+31:i] := SignExtend32(addr[n+15:n]) + ESAC + _MM_BROADCAST_1X16: + CASE conv OF + _MM_UPCONV_EPI32_NONE: + n := j*32 + dst[i+31:i] := addr[31:0] + _MM_UPCONV_EPI32_UINT8: + n := j*8 + dst[i+31:i] := ZeroExtend32(addr[7:0]) + _MM_UPCONV_EPI32_SINT8: + n := j*8 + dst[i+31:i] := SignExtend32(addr[7:0]) + _MM_UPCONV_EPI32_UINT16: + n := j*16 + dst[i+31:i] := ZeroExtend32(addr[15:0]) + _MM_UPCONV_EPI32_SINT16: + n := j*16 + dst[i+31:i] := SignExtend32(addr[15:0]) + ESAC + _MM_BROADCAST_4X16: + mod := j%4 + CASE conv OF + _MM_UPCONV_EPI32_NONE: + n := mod*32 + dst[i+31:i] := addr[n+31:n] + _MM_UPCONV_EPI32_UINT8: + n := mod*8 + dst[i+31:i] := ZeroExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_SINT8: + n := mod*8 + dst[i+31:i] := SignExtend32(addr[n+7:n]) + _MM_UPCONV_EPI32_UINT16: + n := mod*16 + dst[i+31:i] := ZeroExtend32(addr[n+15:n]) + _MM_UPCONV_EPI32_SINT16: + n := mod*16 + dst[i+31:i] := SignExtend32(addr[n+15:n]) + ESAC + ESAC + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + CASE bc OF + _MM_BROADCAST64_NONE: + CASE conv OF + _MM_UPCONV_PD_NONE: + n := j*64 + dst[i+63:i] := addr[n+63:n] + ESAC + _MM_BROADCAST_1X8: + CASE conv OF + _MM_UPCONV_PD_NONE: + n := j*64 + dst[i+63:i] := addr[63:0] + ESAC + _MM_BROADCAST_4X8: + mod := j%4 + CASE conv OF + _MM_UPCONV_PD_NONE: + n := mod*64 + dst[i+63:i] := addr[n+63:n] + ESAC + ESAC +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + IF k[j] + CASE bc OF + _MM_BROADCAST64_NONE: + CASE conv OF + _MM_UPCONV_PD_NONE: + n := j*64 + dst[i+63:i] := addr[n+63:n] + ESAC + _MM_BROADCAST_1X8: + CASE conv OF + _MM_UPCONV_PD_NONE: + n := j*64 + dst[i+63:i] := addr[63:0] + ESAC + _MM_BROADCAST_4X8: + mod := j%4 + CASE conv OF + _MM_UPCONV_PD_NONE: + n := mod*64 + dst[i+63:i] := addr[n+63:n] + ESAC + ESAC + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + CASE bc OF + _MM_BROADCAST64_NONE: + CASE conv OF + _MM_UPCONV_EPI64_NONE: + n := j*64 + dst[i+63:i] := addr[n+63:n] + ESAC + _MM_BROADCAST_1X8: + CASE conv OF + _MM_UPCONV_EPI64_NONE: + n := j*64 + dst[i+63:i] := addr[63:0] + ESAC + _MM_BROADCAST_4X8: + mod := j%4 + CASE conv OF + _MM_UPCONV_EPI64_NONE: + n := mod*64 + dst[i+63:i] := addr[n+63:n] + ESAC + ESAC +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + IF k[j] + CASE bc OF + _MM_BROADCAST64_NONE: + CASE conv OF + _MM_UPCONV_EPI64_NONE: + n := j*64 + dst[i+63:i] := addr[n+63:n] + ESAC + _MM_BROADCAST_1X8: + CASE conv OF + _MM_UPCONV_EPI64_NONE: + n := j*64 + dst[i+63:i] := addr[63:0] + ESAC + _MM_BROADCAST_4X8: + mod := j%4 + CASE conv OF + _MM_UPCONV_EPI64_NONE: + n := mod*64 + dst[i+63:i] := addr[n+63:n] + ESAC + ESAC + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_EPI32_UINT8: + RETURN ZeroExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_SINT8: + RETURN SignExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_UINT16: + RETURN ZeroExtend32(MEM[addr + 2*offset]) + _MM_UPCONV_EPI32_SINT16: + RETURN SignExtend32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN 4 + _MM_UPCONV_EPI32_UINT8: + RETURN 1 + _MM_UPCONV_EPI32_SINT8: + RETURN 1 + _MM_UPCONV_EPI32_UINT16: + RETURN 2 + _MM_UPCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_EPI32_UINT8: + RETURN ZeroExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_SINT8: + RETURN SignExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_UINT16: + RETURN ZeroExtend32(MEM[addr + 2*offset]) + _MM_UPCONV_EPI32_SINT16: + RETURN SignExtend32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN 4 + _MM_UPCONV_EPI32_UINT8: + RETURN 1 + _MM_UPCONV_EPI32_SINT8: + RETURN 1 + _MM_UPCONV_EPI32_UINT16: + RETURN 2 + _MM_UPCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_EPI32_UINT8: + RETURN ZeroExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_SINT8: + RETURN SignExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_UINT16: + RETURN ZeroExtend32(MEM[addr + 2*offset]) + _MM_UPCONV_EPI32_SINT16: + RETURN SignExtend32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN 4 + _MM_UPCONV_EPI32_UINT8: + RETURN 1 + _MM_UPCONV_EPI32_SINT8: + RETURN 1 + _MM_UPCONV_EPI32_UINT16: + RETURN 2 + _MM_UPCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (mt + loadOffset * upSize) % 64 == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_EPI32_UINT8: + RETURN ZeroExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_SINT8: + RETURN SignExtend32(MEM[addr + offset]) + _MM_UPCONV_EPI32_UINT16: + RETURN ZeroExtend32(MEM[addr + 2*offset]) + _MM_UPCONV_EPI32_SINT16: + RETURN SignExtend32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI32_NONE: + RETURN 4 + _MM_UPCONV_EPI32_UINT8: + RETURN 1 + _MM_UPCONV_EPI32_SINT8: + RETURN 1 + _MM_UPCONV_EPI32_UINT16: + RETURN 2 + _MM_UPCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 15 + IF k[j] + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (mt + loadOffset * upSize) % 64 == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (addr + loadOffset*upSize) % 64 == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + IF k[j] + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (addr + loadOffset*upSize % 64) == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) + _MM_UPCONV_PS_UINT8: + RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_SINT8: + RETURN Convert_Int8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_UINT16: + RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) + _MM_UPCONV_PS_SINT16: + RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN 4 + _MM_UPCONV_PS_FLOAT16: + RETURN 2 + _MM_UPCONV_PS_UINT8: + RETURN 1 + _MM_UPCONV_PS_SINT8: + RETURN 1 + _MM_UPCONV_PS_UINT16: + RETURN 2 + _MM_UPCONV_PS_SINT16: + RETURN 2 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) + _MM_UPCONV_PS_UINT8: + RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_SINT8: + RETURN Convert_Int8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_UINT16: + RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) + _MM_UPCONV_PS_SINT16: + RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) + _MM_UPCONV_PS_UINT8: + RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_SINT8: + RETURN Convert_Int8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_UINT16: + RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) + _MM_UPCONV_PS_SINT16: + RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize % 64) == 0 + foundNext64BytesBoundary 
:= true + FI + ELSE + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) + _MM_UPCONV_PS_UINT8: + RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_SINT8: + RETURN Convert_Int8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_UINT16: + RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) + _MM_UPCONV_PS_SINT16: + RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) + _MM_UPCONV_PS_UINT8: + RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_SINT8: + RETURN Convert_Int8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_UINT16: + RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) + _MM_UPCONV_PS_SINT16: + RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (mt + loadOffset * upSize) % 64 == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) + _MM_UPCONV_PS_UINT8: + RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_SINT8: + RETURN Convert_Int8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_UINT16: + RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) + _MM_UPCONV_PS_SINT16: + RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) + ESAC +} +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PS_NONE: + RETURN MEM[addr + 4*offset] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) + _MM_UPCONV_PS_UINT8: + RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_SINT8: + RETURN Convert_Int8_To_FP32(MEM[addr + offset]) + _MM_UPCONV_PS_UINT16: + RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) + _MM_UPCONV_PS_SINT16: + RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := MEM[mt] +FOR j := 0 to 15 + IF k[j] + i := j*32 + dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (mt + loadOffset * upSize) % 64 == 0 + BREAK + FI + FI 
+ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +upSize := UPCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*upSize) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (mt + loadOffset * upSize) % 64 == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +DEFINE UPCONVERT(addr, offset, convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN MEM[addr + 8*offset] + ESAC +} +DEFINE UPCONVERTSIZE(convertTo) { + CASE conv OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +dst[511:0] := src[511:0] +loadOffset := 0 +upSize := UPCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + IF k[j] + i := j*64 + dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + loadOffset := loadOffset + 1 + IF (mt + loadOffset * upSize) % 64 == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*4 % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*4 % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 15 + i := j*32 + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + loadOffset := loadOffset + 1 + IF (mt + loadOffset * 4) % 64 == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + loadOffset := loadOffset + 1 + IF (mt + loadOffset * 4) % 64 == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*8) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*8) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 7 + i := j*64 + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + loadOffset := loadOffset + 1 + IF (addr + loadOffset*8) % 64 == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + loadOffset := loadOffset + 1 + IF (addr + loadOffset*8) % 64 == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*4 % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*4 % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 15 + i := j*32 + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + loadOffset := loadOffset + 1 + IF (mt + loadOffset * 4) % 64 == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp := MEM[addr + loadOffset*4] + dst[i+31:i] := tmp[i+31:i] + loadOffset := loadOffset + 1 + IF (mt + loadOffset * 4) % 64 == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*8) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + FI + loadOffset := loadOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + dst[511:0] := src[511:0] +loadOffset := 0 +foundNext64BytesBoundary := false +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == false + IF (addr + (loadOffset + 1)*8) % 64 == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + FI + loadOffset := loadOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 7 + i := j*64 + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + loadOffset := loadOffset + 1 + IF ((addr + 8*loadOffset) % 64) == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +dst[511:0] := src[511:0] +loadOffset := 0 +addr := mt +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp := MEM[addr + loadOffset*8] + dst[i+63:i] := tmp[i+63:i] + loadOffset := loadOffset + 1 + IF ((addr + 8*loadOffset) % 64) == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Up-converts 8 single-precision (32-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_UPCONV_EPI32_NONE: dst[i+31:i] := MEM[addr+31:addr] + _MM_UPCONV_EPI32_UINT8: dst[i+31:i] := ZeroExtend32(MEM[addr+7:addr]) + _MM_UPCONV_EPI32_SINT8: dst[i+31:i] := SignExtend32(MEM[addr+7:addr]) + _MM_UPCONV_EPI32_UINT16: dst[i+31:i] := ZeroExtend32(MEM[addr+15:addr]) + _MM_UPCONV_EPI32_SINT16: dst[i+31:i] := SignExtend32(MEM[addr+15:addr]) + ESAC +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + + Up-converts 8 single-precision (32-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_UPCONV_EPI32_NONE: dst[i+31:i] := MEM[addr+31:addr] + _MM_UPCONV_EPI32_UINT8: dst[i+31:i] := ZeroExtend32(MEM[addr+7:addr]) + _MM_UPCONV_EPI32_SINT8: dst[i+31:i] := SignExtend32(MEM[addr+7:addr]) + _MM_UPCONV_EPI32_UINT16: dst[i+31:i] := ZeroExtend32(MEM[addr+15:addr]) + _MM_UPCONV_EPI32_SINT16: dst[i+31:i] := SignExtend32(MEM[addr+15:addr]) + ESAC + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the load is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_UPCONV_EPI64_NONE: dst[i+63:i] := MEM[addr+63:addr] + ESAC +ENDFOR +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + + Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_UPCONV_EPI64_NONE: dst[i+63:i] := MEM[addr+63:addr] + ESAC + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Up-converts 8 memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst". "hint" indicates to the processor whether the load is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_UPCONV_PS_NONE: dst[i+31:i] := MEM[addr+31:addr] + _MM_UPCONV_PS_FLOAT16: dst[i+31:i] := Convert_FP16_To_FP32(MEM[addr+15:addr]) + _MM_UPCONV_PS_UINT8: dst[i+31:i] := Convert_UInt8_To_FP32(MEM[addr+7:addr]) + _MM_UPCONV_PS_SINT8: dst[i+31:i] := Convert_Int8_To_FP32(MEM[addr+7:addr]) + _MM_UPCONV_PS_UINT16: dst[i+31:i] := Convert_UInt16_To_FP32(MEM[addr+15:addr]) + _MM_UPCONV_PS_SINT16: dst[i+31:i] := Convert_Int16_To_FP32(MEM[addr+15:addr]) + ESAC +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + + Up-converts 8 memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_UPCONV_PS_NONE: dst[i+31:i] := MEM[addr+31:addr] + _MM_UPCONV_PS_FLOAT16: dst[i+31:i] := Convert_FP16_To_FP32(MEM[addr+15:addr]) + _MM_UPCONV_PS_UINT8: dst[i+31:i] := Convert_UInt8_To_FP32(MEM[addr+7:addr]) + _MM_UPCONV_PS_SINT8: dst[i+31:i] := Convert_Int8_To_FP32(MEM[addr+7:addr]) + _MM_UPCONV_PS_UINT16: dst[i+31:i] := Convert_UInt16_To_FP32(MEM[addr+15:addr]) + _MM_UPCONV_PS_SINT16: dst[i+31:i] := Convert_Int16_To_FP32(MEM[addr+15:addr]) + ESAC + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_UPCONV_PD_NONE: dst[i+63:i] := MEM[addr+63:addr] + ESAC +ENDFOR +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + + Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_UPCONV_PD_NONE: dst[i+63:i] := MEM[addr+63:addr] + ESAC + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads 8 32-bit integer memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst". + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads 8 32-bit integer memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + Loads 8 single-precision (32-bit) floating-point memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst". + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Loads 8 single-precision (32-bit) floating-point memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + KNCNI +
immintrin.h
+ Load +
+ + + + + Performs a swizzle transformation of each of the four groups of packed 4xsingle-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst". + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 7 + i := j*64 + dst[i+31:i] := v[i+63:i+32] + dst[i+63:i+32] := v[i+31:i] + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+95:i+64] + dst[i+63:i+32] := v[i+127:i+96] + dst[i+95:i+64] := v[i+31:i] + dst[i+127:i+96] := v[i+63:i+32] + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+31:i] + dst[i+63:i+32] := v[i+31:i] + dst[i+95:i+64] := v[i+31:i] + dst[i+127:i+96] := v[i+31:i] + ENDFOR +_MM_SWIZ_REG_BBBB: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+63:i+32] + dst[i+63:i+32] := v[i+63:i+32] + dst[i+95:i+64] := v[i+63:i+32] + dst[i+127:i+96] := v[i+63:i+32] + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+95:i+64] + dst[i+63:i+32] := v[i+95:i+64] + dst[i+95:i+64] := v[i+95:i+64] + dst[i+127:i+96] := v[i+95:i+64] + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+127:i+96] + dst[i+63:i+32] := v[i+127:i+96] + dst[i+95:i+64] := v[i+127:i+96] + dst[i+127:i+96] := v[i+127:i+96] + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+63:i+32] + dst[i+63:i+32] := v[i+95:i+64] + dst[i+95:i+64] := v[i+31:i] + dst[i+127:i+96] := v[i+127:i+96] + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst". + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 3 + i := j*128 + dst[i+63:i] := v[i+127:i+64] + dst[i+127:i+64] := v[i+63:i] + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+191:i+128] + dst[i+127:i+64] := v[i+255:i+192] + dst[i+191:i+128] := v[i+63:i] + dst[i+255:i+192] := v[i+127:i+64] + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+63:i] + dst[i+127:i+64] := v[i+63:i] + dst[i+191:i+128] := v[i+63:i] + dst[i+255:i+192] := v[i+63:i] + ENDFOR +_MM_SWIZ_REG_BBBB: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+127:i+64] + dst[i+127:i+64] := v[i+127:i+64] + dst[i+191:i+128] := v[i+127:i+64] + dst[i+255:i+192] := v[i+127:i+64] + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+191:i+128] + dst[i+127:i+64] := v[i+191:i+128] + dst[i+191:i+128] := v[i+191:i+128] + dst[i+255:i+192] := v[i+191:i+128] + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+255:i+192] + dst[i+127:i+64] := v[i+255:i+192] + dst[i+191:i+128] := v[i+255:i+192] + dst[i+255:i+192] := v[i+255:i+192] + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+127:i+64] + dst[i+127:i+64] := v[i+191:i+128] + dst[i+191:i+128] := v[i+63:i] + dst[i+255:i+192] := v[i+255:i+192] + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + Performs a swizzle transformation of each of the four groups of packed 4x 32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst". + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 7 + i := j*64 + dst[i+31:i] := v[i+63:i+32] + dst[i+63:i+32] := v[i+31:i] + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+95:i+64] + dst[i+63:i+32] := v[i+127:i+96] + dst[i+95:i+64] := v[i+31:i] + dst[i+127:i+96] := v[i+63:i+32] + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+31:i] + dst[i+63:i+32] := v[i+31:i] + dst[i+95:i+64] := v[i+31:i] + dst[i+127:i+96] := v[i+31:i] + ENDFOR +_MM_SWIZ_REG_BBBB: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+63:i+32] + dst[i+63:i+32] := v[i+63:i+32] + dst[i+95:i+64] := v[i+63:i+32] + dst[i+127:i+96] := v[i+63:i+32] + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+95:i+64] + dst[i+63:i+32] := v[i+95:i+64] + dst[i+95:i+64] := v[i+95:i+64] + dst[i+127:i+96] := v[i+95:i+64] + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+127:i+96] + dst[i+63:i+32] := v[i+127:i+96] + dst[i+95:i+64] := v[i+127:i+96] + dst[i+127:i+96] := v[i+127:i+96] + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 3 + i := j*128 + dst[i+31:i] := v[i+63:i+32] + dst[i+63:i+32] := v[i+95:i+64] + dst[i+95:i+64] := v[i+31:i] + dst[i+127:i+96] := v[i+127:i+96] + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst". + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 3 + i := j*128 + dst[i+63:i] := v[i+127:i+64] + dst[i+127:i+64] := v[i+63:i] + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+191:i+128] + dst[i+127:i+64] := v[i+255:i+192] + dst[i+191:i+128] := v[i+63:i] + dst[i+255:i+192] := v[i+127:i+64] + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+63:i] + dst[i+127:i+64] := v[i+63:i] + dst[i+191:i+128] := v[i+63:i] + dst[i+255:i+192] := v[i+63:i] + ENDFOR +_MM_SWIZ_REG_BBBB: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+127:i+64] + dst[i+127:i+64] := v[i+127:i+64] + dst[i+191:i+128] := v[i+127:i+64] + dst[i+255:i+192] := v[i+127:i+64] + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+191:i+128] + dst[i+127:i+64] := v[i+191:i+128] + dst[i+191:i+128] := v[i+191:i+128] + dst[i+255:i+192] := v[i+191:i+128] + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+255:i+192] + dst[i+127:i+64] := v[i+255:i+192] + dst[i+191:i+128] := v[i+255:i+192] + dst[i+255:i+192] := v[i+255:i+192] + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 1 + i := j*256 + dst[i+63:i] := v[i+127:i+64] + dst[i+127:i+64] := v[i+191:i+128] + dst[i+191:i+128] := v[i+63:i] + dst[i+255:i+192] := v[i+255:i+192] + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + + + Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 7 + i := j*64 + IF k[j*2] + dst[i+31:i] := v[i+63:i+32] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*2+1] + dst[i+63:i+32] := v[i+31:i] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+95:i+64] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+127:i+96] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+31:i] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+63:i+32] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+31:i] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+31:i] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+31:i] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_BBBB: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+63:i+32] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+63:i+32] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+63:i+32] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+63:i+32] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+95:i+64] + 
ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+95:i+64] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+95:i+64] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+95:i+64] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+127:i+96] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+127:i+96] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+127:i+96] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+127:i+96] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+63:i+32] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+95:i+64] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+31:i] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+127:i+96] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + + + Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 3 + i := j*64 + IF k[j*2] + dst[i+63:i] := v[i+127:i+64] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*2+1] + dst[i+127:i+64] := v[i+63:i] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+191:i+128] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+255:i+192] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+63:i] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+127:i+64] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+63:i] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+63:i] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+63:i] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_BBBB: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+127:i+63] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+127:i+63] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+127:i+63] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+127:i+63] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 1 + i := 
j*256 + IF k[j*4] + dst[i+63:i] := v[i+191:i+128] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+191:i+128] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+191:i+128] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+191:i+128] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+255:i+192] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+255:i+192] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+255:i+192] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+255:i+192] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+127:i+64] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+191:i+128] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+63:i] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+255:i+192] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + + + Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 7 + i := j*64 + IF k[j*2] + dst[i+31:i] := v[i+63:i+32] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*2+1] + dst[i+63:i+32] := v[i+31:i] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+95:i+64] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+127:i+96] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+31:i] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+63:i+32] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+31:i] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+31:i] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+31:i] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_BBBB: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+63:i+32] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+63:i+32] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+63:i+32] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+63:i+32] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+95:i+64] + ELSE + dst[i+31:i] := 
src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+95:i+64] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+95:i+64] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+95:i+64] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+127:i+96] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+127:i+96] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+127:i+96] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+127:i+96] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 3 + i := j*128 + IF k[j*4] + dst[i+31:i] := v[i+63:i+32] + ELSE + dst[i+31:i] := src[i+31:i] + FI + IF k[j*4+1] + dst[i+63:i+32] := v[i+95:i+64] + ELSE + dst[i+63:i+32] := src[i+63:i+32] + FI + IF k[j*4+2] + dst[i+95:i+64] := v[i+31:i] + ELSE + dst[i+95:i+64] := src[i+95:i+64] + FI + IF k[j*4+3] + dst[i+127:i+96] := v[i+127:i+96] + ELSE + dst[i+127:i+96] := src[i+127:i+96] + FI + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + + + Performs a swizzle transformation of each of the four groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + CASE s OF +_MM_SWIZ_REG_NONE: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_DCBA: + dst[511:0] := v[511:0] +_MM_SWIZ_REG_CDAB: + FOR j := 0 to 3 + i := j*64 + IF k[j*2] + dst[i+63:i] := v[i+127:i+64] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*2+1] + dst[i+127:i+64] := v[i+63:i] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + ENDFOR +_MM_SWIZ_REG_BADC: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+191:i+128] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+255:i+192] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+63:i] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+127:i+64] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_AAAA: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+63:i] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+63:i] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+63:i] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR _MM_SWIZ_REG_BBBB: FOR j := 0 to 1 i := j*256 IF k[j*4] dst[i+63:i] := v[i+127:i+63] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+127:i+63] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+127:i+63] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+127:i+63] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_CCCC: + FOR j := 0 to 1 + i := j*256 + IF 
k[j*4] + dst[i+63:i] := v[i+191:i+128] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+191:i+128] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+191:i+128] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+191:i+128] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_DDDD: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+255:i+192] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+255:i+192] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+255:i+192] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+255:i+192] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +_MM_SWIZ_REG_DACB: + FOR j := 0 to 1 + i := j*256 + IF k[j*4] + dst[i+63:i] := v[i+127:i+64] + ELSE + dst[i+63:i] := src[i+63:i] + FI + IF k[j*4+1] + dst[i+127:i+64] := v[i+191:i+128] + ELSE + dst[i+127:i+64] := src[i+127:i+64] + FI + IF k[j*4+2] + dst[i+191:i+128] := v[i+63:i] + ELSE + dst[i+191:i+128] := src[i+191:i+128] + FI + IF k[j*4+3] + dst[i+255:i+192] := v[i+255:i+192] + ELSE + dst[i+255:i+192] := src[i+255:i+192] + FI + ENDFOR +ESAC +dst[MAX:512] := 0 + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst". + +DEFINE SELECT4(src, control) { + CASE control[1:0] OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +FOR j := 0 to 3 + i := j*128 + n := j*2 + dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + + + Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE control[1:0] OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp[511:0] := 0 +FOR j := 0 to 3 + i := j*128 + n := j*2 + tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) +ENDFOR +FOR j := 0 to 15 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + + + Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE control[1:0] OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp[511:0] := 0 +FOR j := 0 to 3 + i := j*128 + n := j*2 + tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) +ENDFOR +FOR j := 0 to 15 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Swizzle +
+ + + + + + + Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. + +addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + CASE conv OF + _MM_DOWNCONV_PS_NONE: + addr[i+31:i] := v[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: + n := j*16 + addr[n+15:n] := Convert_FP32_To_FP16(v[i+31:i]) + _MM_DOWNCONV_PS_UINT8: + n := j*8 + addr[n+7:n] := Convert_FP32_To_UInt8(v[i+31:i]) + _MM_DOWNCONV_PS_SINT8: + n := j*8 + addr[n+7:n] := Convert_FP32_To_Int8(v[i+31:i]) + _MM_DOWNCONV_PS_UINT16: + n := j*16 + addr[n+15:n] := Convert_FP32_To_UInt16(v[i+31:i]) + _MM_DOWNCONV_PS_SINT16: + n := j*16 + addr[n+15:n] := Convert_FP32_To_Int16(v[i+31:i]) + ESAC +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: + addr[i+31:i] := v[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: + n := j*8 + addr[n+7:n] := Int32ToUInt8(v[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: + n := j*8 + addr[n+7:n] := Int32ToSInt8(v[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: + n := j*16 + addr[n+15:n] := Int32ToUInt16(v[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: + n := j*16 + addr[n+15:n] := Int32ToSInt16(v[i+31:i]) + ESAC +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. + +addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + CASE conv OF + _MM_DOWNCONV_PD_NONE: + addr[i+63:i] := v[i+63:i] + ESAC +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. + +addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i] + ESAC +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" using writemask "k" (elements are not written to memory when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + CASE conv OF + _MM_DOWNCONV_PS_NONE: + mt[i+31:i] := v[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: + n := j*16 + mt[n+15:n] := Convert_FP32_To_FP16(v[i+31:i]) + _MM_DOWNCONV_PS_UINT8: + n := j*8 + mt[n+7:n] := Convert_FP32_To_UInt8(v[i+31:i]) + _MM_DOWNCONV_PS_SINT8: + n := j*8 + mt[n+7:n] := Convert_FP32_To_Int8(v[i+31:i]) + _MM_DOWNCONV_PS_UINT16: + n := j*16 + mt[n+15:n] := Convert_FP32_To_UInt16(v[i+31:i]) + _MM_DOWNCONV_PS_SINT16: + n := j*16 + mt[n+15:n] := Convert_FP32_To_Int16(v[i+31:i]) + ESAC + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + +addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + CASE conv OF + _MM_DOWNCONV_PD_NONE: + IF k[j] + mt[i+63:i] := v[i+63:i] + FI + ESAC +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: + addr[i+31:i] := v[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: + n := j*8 + addr[n+7:n] := Int32ToUInt8(v[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: + n := j*8 + addr[n+7:n] := Int32ToSInt8(v[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: + n := j*16 + addr[n+15:n] := Int32ToUInt16(v[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: + n := j*16 + addr[n+15:n] := Int32ToSInt16(v[i+31:i]) + ESAC + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + +addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i] + ESAC + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor. + +addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + addr[i+31:i] := v[i+31:i] +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor. + +addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + addr[i+63:i] := v[i+63:i] +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them). + +addr := MEM[mt] +FOR j := 0 to 15 + i := j*32 + addr[i+31:i] := v[i+31:i] +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them). + +addr := MEM[mt] +FOR j := 0 to 7 + i := j*64 + addr[i+63:i] := v[i+63:i] +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN element[31:0] + _MM_DOWNCONV_EPI32_UINT8: + RETURN Truncate8(element[31:0]) + _MM_DOWNCONV_EPI32_SINT8: + RETURN Saturate8(element[31:0]) + _MM_DOWNCONV_EPI32_UINT16: + RETURN Truncate16(element[31:0]) + _MM_DOWNCONV_EPI32_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN 4 + _MM_DOWNCONV_EPI32_UINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_SINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_UINT16: + RETURN 2 + _MM_DOWNCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + FI + storeOffset := storeOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresonding mask bit is not set). + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN element[31:0] + _MM_DOWNCONV_EPI32_UINT8: + RETURN Truncate8(element[31:0]) + _MM_DOWNCONV_EPI32_SINT8: + RETURN Saturate8(element[31:0]) + _MM_DOWNCONV_EPI32_UINT16: + RETURN Truncate16(element[31:0]) + _MM_DOWNCONV_EPI32_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN 4 + _MM_DOWNCONV_EPI32_UINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_SINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_UINT16: + RETURN 2 + _MM_DOWNCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + FI + storeOffset := storeOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN element[31:0] + _MM_DOWNCONV_EPI32_UINT8: + RETURN Truncate8(element[31:0]) + _MM_DOWNCONV_EPI32_SINT8: + RETURN Saturate8(element[31:0]) + _MM_DOWNCONV_EPI32_UINT16: + RETURN Truncate16(element[31:0]) + _MM_DOWNCONV_EPI32_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN 4 + _MM_DOWNCONV_EPI32_UINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_SINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_UINT16: + RETURN 2 + _MM_DOWNCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 15 + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are written to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN element[31:0] + _MM_DOWNCONV_EPI32_UINT8: + RETURN Truncate8(element[31:0]) + _MM_DOWNCONV_EPI32_SINT8: + RETURN Saturate8(element[31:0]) + _MM_DOWNCONV_EPI32_UINT16: + RETURN Truncate16(element[31:0]) + _MM_DOWNCONV_EPI32_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_DOWNCONV_EPI32_NONE: + RETURN 4 + _MM_DOWNCONV_EPI32_UINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_SINT8: + RETURN 1 + _MM_DOWNCONV_EPI32_UINT16: + RETURN 2 + _MM_DOWNCONV_EPI32_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 15 + IF k[j] + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC + FI + storeOffset := storeOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*64 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC + FI + storeOffset := storeOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + i := j*64 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_EPI64_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + IF k[j] + i := j*64 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK + FI + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN element[31:0] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP32_To_FP16(element[31:0]) + _MM_UPCONV_PS_UINT8: + RETURN Truncate8(element[31:0]) + _MM_UPCONV_PS_SINT8: + RETURN Saturate8(element[31:0]) + _MM_UPCONV_PS_UINT16: + RETURN Truncate16(element[31:0]) + _MM_UPCONV_PS_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN 4 + _MM_UPCONV_PS_FLOAT16: + RETURN 2 + _MM_UPCONV_PS_UINT8: + RETURN 1 + _MM_UPCONV_PS_SINT8: + RETURN 1 + _MM_UPCONV_PS_UINT16: + RETURN 2 + _MM_UPCONV_PS_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + FI + storeOffset := storeOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN element[31:0] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP32_To_FP16(element[31:0]) + _MM_UPCONV_PS_UINT8: + RETURN Truncate8(element[31:0]) + _MM_UPCONV_PS_SINT8: + RETURN Saturate8(element[31:0]) + _MM_UPCONV_PS_UINT16: + RETURN Truncate16(element[31:0]) + _MM_UPCONV_PS_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN 4 + _MM_UPCONV_PS_FLOAT16: + RETURN 2 + _MM_UPCONV_PS_UINT8: + RETURN 1 + _MM_UPCONV_PS_SINT8: + RETURN 1 + _MM_UPCONV_PS_UINT16: + RETURN 2 + _MM_UPCONV_PS_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true + FI + ELSE + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + FI + storeOffset := storeOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN element[31:0] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP32_To_FP16(element[31:0]) + _MM_UPCONV_PS_UINT8: + RETURN Truncate8(element[31:0]) + _MM_UPCONV_PS_SINT8: + RETURN Saturate8(element[31:0]) + _MM_UPCONV_PS_UINT16: + RETURN Truncate16(element[31:0]) + _MM_UPCONV_PS_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN 4 + _MM_UPCONV_PS_FLOAT16: + RETURN 2 + _MM_UPCONV_PS_UINT8: + RETURN 1 + _MM_UPCONV_PS_SINT8: + RETURN 1 + _MM_UPCONV_PS_UINT16: + RETURN 2 + _MM_UPCONV_PS_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 15 + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN element[31:0] + _MM_UPCONV_PS_FLOAT16: + RETURN Convert_FP32_To_FP16(element[31:0]) + _MM_UPCONV_PS_UINT8: + RETURN Truncate8(element[31:0]) + _MM_UPCONV_PS_SINT8: + RETURN Saturate8(element[31:0]) + _MM_UPCONV_PS_UINT16: + RETURN Truncate16(element[31:0]) + _MM_UPCONV_PS_SINT16: + RETURN Saturate16(element[31:0]) + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PS_NONE: + RETURN 4 + _MM_UPCONV_PS_FLOAT16: + RETURN 2 + _MM_UPCONV_PS_UINT8: + RETURN 1 + _MM_UPCONV_PS_SINT8: + RETURN 1 + _MM_UPCONV_PS_UINT16: + RETURN 2 + _MM_UPCONV_PS_SINT16: + RETURN 2 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 15 + IF k[j] + i := j*32 + tmp := DOWNCONVERT(v1[i+31:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 4: MEM[storeAddr] := tmp[31:0] + 2: MEM[storeAddr] := tmp[15:0] + 1: MEM[storeAddr] := tmp[7:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+127:i+63] - ELSE - dst[i+127:i+64] := src[i+127:i+64] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+127:i+63] + ELSE + i := j*64 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC + FI + storeOffset := storeOffset + 1 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +foundNext64BytesBoundary := false +downSize := DOWNCONVERTSIZE(conv) +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == false + IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 + foundNext64BytesBoundary := true + FI ELSE - dst[i+191:i+128] := src[i+191:i+128] + i := j*64 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+127:i+63] - ELSE - dst[i+255:i+192] := src[i+255:i+192] + storeOffset := storeOffset + 1 + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + i := j*64 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +DEFINE DOWNCONVERT(element, convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN element[63:0] + ESAC +} +DEFINE DOWNCONVERTSIZE(convertTo) { + CASE convertTo OF + _MM_UPCONV_PD_NONE: + RETURN 8 + ESAC +} +storeOffset := 0 +downSize := DOWNCONVERTSIZE(conv) +addr := mt +FOR j := 0 to 7 + IF k[j] + i := j*63 + tmp := DOWNCONVERT(v1[i+63:i], conv) + storeAddr := addr + storeOffset * downSize + CASE downSize OF + 8: MEM[storeAddr] := tmp[63:0] + ESAC + storeOffset := storeOffset + 1 + IF ((addr + storeOffset * downSize) % 64) == 0 + BREAK FI - ENDFOR -_MM_SWIZ_REG_CCCC: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+191:i+128] - ELSE - dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory). + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (m5-64)). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*4) % 64) == 0 + foundNext64BytesBoundary := 1 FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+191:i+128] + ELSE + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] + FI + storeOffset := storeOffset + 1 +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (m5-64)). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*4) % 64) == 0 + foundNext64BytesBoundary := 1 + FI ELSE - dst[i+127:i+64] := src[i+127:i+64] + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+191:i+128] - ELSE - dst[i+191:i+128] := src[i+191:i+128] + storeOffset := storeOffset + 1 + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). + +storeOffset := 0 +addr := mt +FOR j := 0 to 15 + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*4) % 64) == 0 + BREAK + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +addr := mt +FOR j := 0 to 15 + IF k[j] + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*4) % 64) == 0 + BREAK FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+191:i+128] - ELSE - dst[i+255:i+192] := src[i+255:i+192] + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*8) % 64) == 0 + foundNext64BytesBoundary := 1 FI - ENDFOR -_MM_SWIZ_REG_DDDD: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+255:i+192] + ELSE + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] + FI + storeOffset := storeOffset + 1 +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*8) % 64) == 0 + foundNext64BytesBoundary := 1 + FI ELSE - dst[i+63:i] := src[i+63:i] + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+255:i+192] - ELSE - dst[i+127:i+64] := src[i+127:i+64] + storeOffset := storeOffset + 1 + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). + +storeOffset := 0 +addr := mt +FOR j := 0 to 7 + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*8) % 64) == 0 + BREAK + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +addr := mt +FOR j := 0 to 7 + IF k[j] + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*8) % 64) == 0 + BREAK FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+255:i+192] - ELSE - dst[i+191:i+128] := src[i+191:i+128] + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 15 + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*4) % 64) == 0 + foundNext64BytesBoundary := 1 FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+255:i+192] + ELSE + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] + FI + storeOffset := storeOffset + 1 +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 15 + IF k[j] + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*4) % 64) == 0 + foundNext64BytesBoundary := 1 + FI ELSE - dst[i+255:i+192] := src[i+255:i+192] + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] FI - ENDFOR -_MM_SWIZ_REG_DACB: - FOR j := 0 to 1 - i := j*256 - IF k[j*4] - dst[i+63:i] := v[i+127:i+64] - ELSE - dst[i+63:i] := src[i+63:i] + storeOffset := storeOffset + 1 + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). + +storeOffset := 0 +addr := mt +FOR j := 0 to 15 + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*4) % 64) == 0 + BREAK + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +addr := mt +FOR j := 0 to 15 + IF k[j] + i := j*32 + MEM[addr + storeOffset*4] := v1[i+31:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*4) % 64) == 0 + BREAK FI - IF k[j*4+1] - dst[i+127:i+64] := v[i+191:i+128] - ELSE - dst[i+127:i+64] := src[i+127:i+64] + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 7 + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*8) % 64) == 0 + foundNext64BytesBoundary := 1 FI - IF k[j*4+2] - dst[i+191:i+128] := v[i+63:i] + ELSE + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] + FI + storeOffset := storeOffset + 1 +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +foundNext64BytesBoundary := 0 +addr := mt-64 +FOR j := 0 to 7 + IF k[j] + IF foundNext64BytesBoundary == 0 + IF ((addr + (storeOffset + 1)*8) % 64) == 0 + foundNext64BytesBoundary := 1 + FI ELSE - dst[i+191:i+128] := src[i+191:i+128] + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] FI - IF k[j*4+3] - dst[i+255:i+192] := v[i+255:i+192] - ELSE - dst[i+255:i+192] := src[i+255:i+192] + storeOffset := storeOffset + 1 + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). + +storeOffset := 0 +addr := mt +FOR j := 0 to 7 + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*8) % 64) == 0 + BREAK + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + +storeOffset := 0 +addr := mt +FOR j := 0 to 7 + IF k[j] + i := j*64 + MEM[addr + storeOffset*8] := v1[i+63:i] + storeOffset := storeOffset + 1 + IF ((addr + storeOffset*8) % 64) == 0 + BREAK FI - ENDFOR -ESAC + FI +ENDFOR + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts 8 packed single-precision (32-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) + _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) + _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) + _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) + _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) + ESAC +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts 8 packed single-precision (32-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are only written when the corresponding mask bit is set in "k"; otherwise, elements are unchanged in memory. "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) + _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) + _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) + _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) + _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) + ESAC + FI +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are written to memory using writemask "k" (elements are not stored to memory when the corresponding mask bit is not set; the memory location is left unchanged). "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC + FI +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts the low 8 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i]) + ESAC +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts the low 8 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are written to memory using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, the memory location is left unchanged). "hint" indicates to the processor whether the data is non-temporal. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i]) + ESAC + FI +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts 8 packed 64-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the load is non-temporal. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts 8 packed 64-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC + FI +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst". + +DEFINE SELECT4(src, control) { + CASE control[1:0] OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +FOR j := 0 to 3 + i := j*128 + n := j*2 + dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Stores 8 packed single-precision (32-bit) floating-point elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Stores 8 packed single-precision (32-bit) floating-point elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Stores 8 packed 32-bit integer elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Stores 8 packed 32-bit integer elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res". + FOR j := 0 to 15 + i := j*32 + k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j]) + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j]) + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j] + ELSE + dst[i+31:i] := v2[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs element by element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element by element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs element by element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element by element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := v3[i+63:i] - v2[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := v3[i+63:i] - v2[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := v3[i+63:i] - v2[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := v3[i+63:i] - v2[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set) + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst" using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + ELSE + dst[i+31:i] := v2[i+31:i] + k2_res[j] := k_old[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + + Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag). + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] - v3[i+31:i] + borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag). Results are stored using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] - v3[i+31:i] + borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i]) + ELSE + dst[i+31:i] := v3[i+31:i] + borrow[j] := k_old[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are written using writemask "k" (elements are copied from "k" to "k_old" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + diff := v3[i+31:i] - v2[i+31:i] + borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i]) + dst[i+31:i] := diff + v2[i+31:i] := diff + ELSE + borrow[j] := k_old[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k[j] + borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k2" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k2[j] + borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j]) + ELSE + dst[i+31:i] := v2[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k[j] + borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k[j]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k2" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k2[j] + borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k2[j]) + ELSE + dst[i+31:i] := v2[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed 32-bit integer elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + base := (j & ~0x3) * 32 + scale[31:0] := b[base+63:base+32] + bias[31:0] := b[base+31:base] + dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed 32-bit integer elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + base := (j & ~0x3) * 32 + scale[31:0] := b[base+63:base+32] + bias[31:0] := b[base+31:base] + dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + base := (j & ~0x3) * 32 + scale[31:0] := b[base+63:base+32] + bias[31:0] := b[base+31:base] + dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + base := (j & ~0x3) * 32 + scale[31:0] := b[base+63:base+32] + bias[31:0] := b[base+31:base] + dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst". Intermediate elements are rounded using "rounding". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * POW(2.0,FP32(b[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exp, where the exp is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are rounded using constant "rounding". + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + base := (j & ~0x3) * 32 + scale[31:0] := b[base+63:base+32] + bias[31:0] := b[base+31:base] + dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + base := (j & ~0x3) * 32 + scale[31:0] := b[base+63:base+32] + bias[31:0] := b[base+31:base] + dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Arithmetic +
+ + + + + Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'. + [round_note] + +FOR j := 0 to 7 + i := j*64 + k := j*32 + dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + + Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'. + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'. + [round_note] + +FOR j := 0 to 7 + i := j*64 + k := j*32 + dst[k+31:k] := Convert_FP64_To_Int32(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + + Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'. + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_FP64_To_Int32(v2[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit integer elements and performs an optional exponent adjust using "expadj", storing the results in "dst". + [round_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC + dst[i+31:i] := Float32ToInt32(dst[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements and performing an optional exponent adjust using "expadj", storing the results in "dst". + [round_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := v2[i+31:i] + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC + dst[i+31:i] := Float32ToUInt32(dst[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst". + [round_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := UInt32ToFloat32(v2[i+31:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + + + Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Int32ToFloat32(v2[i+31:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst". The elements in the upper half of "dst" are set to 0. + [round_note] + +FOR j := 0 to 7 + i := j*64 + k := j*32 + dst[k+31:k] := Convert_FP64_To_Int32(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + + Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements in the upper half of "dst" are set to 0. + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_FP64_To_Int32(v2[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of packed 32-bit integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst". + [round_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := Int32ToFloat32(v2[i+31:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value using "expadj" and in the direction of "rounding", and store the results as packed single-precision floating-point elements in "dst". + [round_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value using "expadj" and in the direction of "rounding", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ROUND(a[i+31:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Convert +
+ + + + Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in "v2" with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := exp223(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in "v2" with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := exp223(v2[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := Sqrt(1.0 / a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Sqrt(1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Fixes up NaN's from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaN's from "v1" in "v3". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i]) + v3[i+63:i] := QuietizeNaNs(v1[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fixes up NaN's from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from "v1" are stored in "v3". + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i]) + v3[i+63:i] := QuietizeNaNs(v1[i+63:i]) + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + + + Fixes up NaN's from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaN's from "v1" in "v3". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i]) + v3[i+31:i] := QuietizeNaNs(v1[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fixes up NaN's from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from "v1" are stored in "v3". + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i]) + v3[i+31:i] := QuietizeNaNs(v1[i+31:i]) + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + + + Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst". + [round_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ROUND(a[i+31:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + + + Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst". + [round_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ROUND(a[i+63:i]) + CASE expadj OF + _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) + _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) + _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) + _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) + _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) + _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) + _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) + _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) + ESAC + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Miscellaneous +
+ + + + Counts the number of set bits in 32-bit unsigned integer "r1", returning the results in "dst". + dst[31:0] := PopCount(r1[31:0]) + + + KNCNI +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of set bits in 64-bit unsigned integer "r1", returning the results in "dst". + dst[63:0] := PopCount(r1[63:0]) + + + KNCNI +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of trailing zero bits in unsigned 32-bit integer "x" starting at bit "a", and return that count in "dst". + +tmp := a +IF tmp < 0 + tmp := 0 +FI +dst := 0 +IF tmp > 31 + dst := 32 +ELSE + DO WHILE ((tmp < 32) AND x[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 + OD +FI + + + KNCNI +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of trailing zero bits in unsigned 64-bit integer "x" starting at bit "a", and return that count in "dst". + +tmp := a +IF tmp < 0 + tmp := 0 +FI +dst := 0 +IF tmp > 63 + dst := 64 +ELSE + DO WHILE ((tmp < 64) AND x[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 + OD +FI + + + KNCNI +
immintrin.h
+ Bit Manipulation +
+ + + + + Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + + + Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the maximum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + + + Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + + + Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the maximum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + + + Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the minimum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + + + Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the minimum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + + + Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". + min := a[31:0] +FOR j := 1 to 15 + i := j*32 + dst := FpMin(min, a[i+31:i]) +ENDFOR +dst := min + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst" using writemask "k" (elements are ignored when the corresponding mask bit is not set). + min := a[31:0] +FOR j := 1 to 15 + i := j*32 + IF k[j] + CONTINUE + ELSE + dst := FpMin(min, a[i+31:i]) + FI +ENDFOR +dst := min + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". + min := a[63:0] +FOR j := 1 to 7 + i := j*64 + dst := FpMin(min, a[i+63:i]) +ENDFOR +dst := min + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set). + min := a[63:0] +FOR j := 1 to 7 + i := j*64 + IF k[j] + CONTINUE + ELSE + dst := FpMin(min, a[i+63:i]) + FI +ENDFOR +dst := min + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". + max := a[31:0] +FOR j := 1 to 15 + i := j*32 + dst := FpMax(max, a[i+31:i]) +ENDFOR +dst := max + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set). + max := a[31:0] +FOR j := 1 to 15 + i := j*32 + IF k[j] + CONTINUE + ELSE + dst := FpMax(max, a[i+31:i]) + FI +ENDFOR +dst := max + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". + max := a[63:0] +FOR j := 1 to 7 + i := j*64 + dst := FpMax(max, a[i+63:i]) +ENDFOR +dst := max + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set). + max := a[63:0] +FOR j := 1 to 7 + i := j*64 + IF k[j] + CONTINUE + ELSE + dst := FpMax(max, a[i+63:i]) + FI +ENDFOR +dst := max + + KNCNI +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Prefetch single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) + FI +ENDFOR + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. +The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) +ENDFOR + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. +The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) + FI +ENDFOR + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) +ENDFOR + + + + KNCNI +
immintrin.h
+ Load +
+ + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint", with a request for exclusive ownership. The "hint" parameter may be one of the following: _MM_HINT_T0 = 1 for prefetching to L1 cache, _MM_HINT_T1 = 2 for prefetching to L2 cache, _MM_HINT_T2 = 3 for prefetching to L2 cache non-temporal, _MM_HINT_NTA = 0 for prefetching to L1 cache non-temporal. The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent scatter intrinsic. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) +ENDFOR + + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + + Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. +The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic. Only those elements whose corresponding mask bit in "k" is set are loaded into cache. + +cachev := 0 +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) + FI +ENDFOR + + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) +ENDFOR + + + + KNCNI +
immintrin.h
+ Store +
+ + + + + + + + Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. Only those elements whose corresponding mask bit in "k" is set are loaded into the desired cache. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + Prefetch(MEM[addr+31:addr], hint) + FI +ENDFOR + + + + KNCNI +
immintrin.h
+ Store +
+ + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -addr := MEM[mt] FOR j := 0 to 15 i := j*32 - CASE conv OF - _MM_DOWNCONV_PS_NONE: - addr[i+31:i] := v[i+31:i] - _MM_DOWNCONV_PS_FLOAT16: - n := j*16 - addr[n+15:n] := Convert_FP32_To_FP16(v[i+31:i]) - _MM_DOWNCONV_PS_UINT8: - n := j*8 - addr[n+7:n] := Convert_FP32_To_UInt8(v[i+31:i]) - _MM_DOWNCONV_PS_SINT8: - n := j*8 - addr[n+7:n] := Convert_FP32_To_Int8(v[i+31:i]) - _MM_DOWNCONV_PS_UINT16: - n := j*16 - addr[n+15:n] := Convert_FP32_To_UInt16(v[i+31:i]) - _MM_DOWNCONV_PS_SINT16: - n := j*16 - addr[n+15:n] := Convert_FP32_To_Int16(v[i+31:i]) - ESAC + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] + Arithmetic + + + + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 15 i := j*32 - CASE conv OF - _MM_DOWNCONV_EPI32_NONE: - addr[i+31:i] := v[i+31:i] - _MM_DOWNCONV_EPI32_UINT8: - n := j*8 - addr[n+7:n] := Int32ToUInt8(v[i+31:i]) - _MM_DOWNCONV_EPI32_SINT8: - n := j*8 - addr[n+7:n] := Int32ToSInt8(v[i+31:i]) - _MM_DOWNCONV_EPI32_UINT16: - n := j*16 - addr[n+15:n] := Int32ToUInt16(v[i+31:i]) - _MM_DOWNCONV_EPI32_SINT16: - n := j*16 - addr[n+15:n] := Int32ToSInt16(v[i+31:i]) - ESAC + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". -addr := MEM[mt] FOR j := 0 to 7 i := j*64 - CASE conv OF - _MM_DOWNCONV_PS_NONE: - addr[i+63:i] := v[i+63:i] - ESAC + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] -addr := MEM[mt] FOR j := 0 to 7 i := j*64 - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i] - ESAC + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" using writemask "k" (elements are not written to memory when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - CASE conv OF - _MM_DOWNCONV_PS_NONE: - mt[i+31:i] := v[i+31:i] - _MM_DOWNCONV_PS_FLOAT16: - n := j*16 - mt[n+15:n] := Convert_FP32_To_FP16(v[i+31:i]) - _MM_DOWNCONV_PS_UINT8: - n := j*8 - mt[n+7:n] := Convert_FP32_To_UInt8(v[i+31:i]) - _MM_DOWNCONV_PS_SINT8: - n := j*8 - mt[n+7:n] := Convert_FP32_To_Int8(v[i+31:i]) - _MM_DOWNCONV_PS_UINT16: - n := j*16 - mt[n+15:n] := Convert_FP32_To_UInt16(v[i+31:i]) - _MM_DOWNCONV_PS_SINT16: - n := j*16 - mt[n+15:n] := Convert_FP32_To_Int16(v[i+31:i]) - ESAC - FI + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] -addr := MEM[mt] FOR j := 0 to 7 i := j*64 - CASE conv OF - _MM_DOWNCONV_PD_NONE: - IF k[j] - mt[i+63:i] := v[i+63:i] - FI - ESAC + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. - addr := MEM[mt] -FOR j := 0 to 15 - i := j*32 + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 IF k[j] - CASE conv OF - _MM_DOWNCONV_EPI32_NONE: - addr[i+31:i] := v[i+31:i] - _MM_DOWNCONV_EPI32_UINT8: - n := j*8 - addr[n+7:n] := Int32ToUInt8(v[i+31:i]) - _MM_DOWNCONV_EPI32_SINT8: - n := j*8 - addr[n+7:n] := Int32ToSInt8(v[i+31:i]) - _MM_DOWNCONV_EPI32_UINT16: - n := j*16 - addr[n+15:n] := Int32ToUInt16(v[i+31:i]) - _MM_DOWNCONV_EPI32_SINT16: - n := j*16 - addr[n+15:n] := Int32ToSInt16(v[i+31:i]) - ESAC + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] -addr := MEM[mt] FOR j := 0 to 7 i := j*64 IF k[j] - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i] - ESAC + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor. + Arithmetic + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". -addr := MEM[mt] FOR j := 0 to 15 i := j*32 - addr[i+31:i] := v[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor. + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] -addr := MEM[mt] -FOR j := 0 to 7 - i := j*64 - addr[i+63:i] := v[i+63:i] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them). + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -addr := MEM[mt] FOR j := 0 to 15 i := j*32 - addr[i+31:i] := v[i+31:i] + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them). + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] -addr := MEM[mt] -FOR j := 0 to 7 - i := j*64 - addr[i+63:i] := v[i+63:i] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res". - FOR j := 0 to 15 + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j]) - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j] + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - - Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set). - FOR j := 0 to 15 + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 i := j*32 - IF k1[j] - k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j]) - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j] + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := v2[i+31:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst". + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] FOR j := 0 to 7 i := j*64 - IF k[j] - dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst". + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - Performs element by element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst". - [round_note] + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - - Performs element by element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i]) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - Performs element by element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst". - [round_note] + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - - Performs element by element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". [round_note] FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst". + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := v3[i+63:i] - v2[i+63:i] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] -FOR j := 0 to 7 - i := j*64 +FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+63:i] := v3[i+63:i] - v2[i+63:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst". + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst". - [round_note] + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". FOR j := 0 to 7 i := j*64 - dst[i+63:i] := v3[i+63:i] - v2[i+63:i] -ENDFOR + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - - Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 IF k[j] - dst[i+63:i] := v3[i+63:i] - v2[i+63:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE - dst[i+63:i] := src[i+63:i] + dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst". - [round_note] + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] -ENDFOR +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - - Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst". + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] -ENDFOR + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set) + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := c[i+31:i] FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - - Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] - k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i]) -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - - Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst" using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set). + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := v2[i+31:i] - k2_res[j] := k_old[j] + dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] - sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 -ENDFOR + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - - Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] - sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] - sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 -ENDFOR +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - - Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] - sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). - [round_note] + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] - sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 -ENDFOR +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Arithmetic - - - - - - - - Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] -FOR j := 0 to 15 - i := j*32 +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := v2[i+31:i] + v3[i+31:i] - sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+31:i] := src[i+31:i] + dst[i+63:i] := c[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag). - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i]) -ENDFOR + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - - Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag). Results are stored using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 IF k[j] - dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i]) + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE - dst[i+31:i] := v3[i+31:i] - borrow[j] := k_old[j] + dst[i+63:i] := a[i+63:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). - FOR j := 0 to 15 + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i]) -ENDFOR + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - - Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are written using writemask "k" (elements are copied from "k" to "k_old" when the corresponding mask bit is not set). - FOR j := 0 to 15 + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 i := j*32 - IF k[j] - diff := v3[i+31:i] - v2[i+31:i] - borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i]) - dst[i+31:i] := diff - v2[i+31:i] := diff - ELSE - borrow[j] := k_old[j] - FI -ENDFOR + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst". - FOR j := 0 to 15 + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k[j] - borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j]) -ENDFOR + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - - Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k2" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set). - FOR j := 0 to 15 + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 i := j*32 - IF k1[j] - dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k2[j] - borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j]) + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+31:i] := v2[i+31:i] + dst[i+31:i] := c[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst". - FOR j := 0 to 15 + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k[j] - borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j]) -ENDFOR + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Integer - KNCNI Arithmetic - - - - - - - Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k2" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set). - FOR j := 0 to 15 + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 i := j*32 - IF k1[j] - dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k2[j] - borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j]) + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE - dst[i+31:i] := v2[i+31:i] + dst[i+31:i] := a[i+31:i] FI -ENDFOR +ENDFOR dst[MAX:512] := 0 - + + + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Convert - - - - Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'. - [round_note] + Arithmetic + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. FOR j := 0 to 7 i := j*64 - k := j*32 - dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Convert - - - - - - Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'. + Arithmetic + + + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_note] FOR j := 0 to 7 i := j*64 - l := j*32 IF k[j] - dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) + dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE - dst[l+31:l] := src[l+31:l] + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'. - [round_note] + Arithmetic + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 7 i := j*64 - k := j*32 - dst[k+31:k] := Convert_FP64_To_Int32(v2[i+63:i]) + dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - - - Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'. + Arithmetic + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". [round_note] FOR j := 0 to 7 i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_FP64_To_Int32(v2[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - - Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit integer elements and performs an optional exponent adjust using "expadj", storing the results in "dst". - [round_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := v2[i+31:i] - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC - dst[i+31:i] := Float32ToInt32(dst[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - - Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements and performing an optional exponent adjust using "expadj", storing the results in "dst". - [round_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := v2[i+31:i] - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC - dst[i+31:i] := Float32ToUInt32(dst[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - -
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - - Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst". - [round_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := UInt32ToFloat32(v2[i+31:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC + dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - - - - Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - FOR j := 0 to 15 + Arithmetic + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := Int32ToFloat32(v2[i+31:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC + dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in "v2" with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := exp223(v2[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - - - Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in "v2" with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Arithmetic + + + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := exp223(v2[i+31:i]) + dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - Fixes up NaN's from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaN's from "v1" in "v3". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i]) - v3[i+63:i] := QuietizeNaNs(v1[i+63:i]) + Arithmetic + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - - Fixes up NaN's from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from "v1" are stored in "v3". - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i]) - v3[i+63:i] := QuietizeNaNs(v1[i+63:i]) - FI + Arithmetic + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - Fixes up NaN's from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaN's from "v1" in "v3". - FOR j := 0 to 15 + Arithmetic + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i]) - v3[i+31:i] := QuietizeNaNs(v1[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - - Fixes up NaN's from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from "v1" are stored in "v3". - FOR j := 0 to 15 + Arithmetic + + + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 IF k[j] - dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i]) - v3[i+31:i] := QuietizeNaNs(v1[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_EPI32_UINT8: - RETURN ZeroExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_SINT8: - RETURN SignExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_UINT16: - RETURN ZeroExtend32(MEM[addr + 2*offset]) - _MM_UPCONV_EPI32_SINT16: - RETURN SignExtend32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN 4 - _MM_UPCONV_EPI32_UINT8: - RETURN 1 - _MM_UPCONV_EPI32_SINT8: - RETURN 1 - _MM_UPCONV_EPI32_UINT16: - RETURN 2 - _MM_UPCONV_EPI32_SINT16: - RETURN 2 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 + Arithmetic + + + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+ FOR j := 0 to 15 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize % 64) == 0 - foundNext64BytesBoundary := true - FI + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ELSE - i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) + dst[i+31:i] := src[i+31:i] FI - loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - - Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_EPI32_UINT8: - RETURN ZeroExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_SINT8: - RETURN SignExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_UINT16: - RETURN ZeroExtend32(MEM[addr + 2*offset]) - _MM_UPCONV_EPI32_SINT16: - RETURN SignExtend32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN 4 - _MM_UPCONV_EPI32_UINT8: - RETURN 1 - _MM_UPCONV_EPI32_SINT8: - RETURN 1 - _MM_UPCONV_EPI32_UINT16: - RETURN 2 - _MM_UPCONV_EPI32_SINT16: - RETURN 2 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 + Arithmetic + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". 
+ FOR j := 0 to 15 - IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) - FI - loadOffset := loadOffset + 1 - FI + i := j*32 + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + Arithmetic + + + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_EPI32_UINT8: - RETURN ZeroExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_SINT8: - RETURN SignExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_UINT16: - RETURN ZeroExtend32(MEM[addr + 2*offset]) - _MM_UPCONV_EPI32_SINT16: - RETURN SignExtend32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN 4 - _MM_UPCONV_EPI32_UINT8: - RETURN 1 - _MM_UPCONV_EPI32_SINT8: - RETURN 1 - _MM_UPCONV_EPI32_UINT16: - RETURN 2 - _MM_UPCONV_EPI32_SINT16: - RETURN 2 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := mt FOR j := 0 to 15 i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (mt + loadOffset * upSize) % 64 == 0 - BREAK - FI + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - - Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Arithmetic + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_EPI32_UINT8: - RETURN ZeroExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_SINT8: - RETURN SignExtend32(MEM[addr + offset]) - _MM_UPCONV_EPI32_UINT16: - RETURN ZeroExtend32(MEM[addr + 2*offset]) - _MM_UPCONV_EPI32_SINT16: - RETURN SignExtend32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI32_NONE: - RETURN 4 - _MM_UPCONV_EPI32_UINT8: - RETURN 1 - _MM_UPCONV_EPI32_SINT8: - RETURN 1 - _MM_UPCONV_EPI32_UINT16: - RETURN 2 - _MM_UPCONV_EPI32_SINT16: - RETURN 2 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := mt FOR j := 0 to 15 - IF k[j] - i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (mt + loadOffset * upSize) % 64 == 0 - BREAK - FI - FI + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 + Arithmetic + + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize) == 0 - foundNext64BytesBoundary := true - FI + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE - i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + dst[i+63:i] := src[i+63:i] FI - loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 + Arithmetic + + + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 7 + i := j*64 IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) - FI - loadOffset := loadOffset + 1 + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quad that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + Arithmetic + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := mt FOR j := 0 to 7 i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (addr + loadOffset*upSize % 64) == 0 - BREAK - FI + dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Arithmetic + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + [round_note] -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := mt FOR j := 0 to 7 - IF k[j] - i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (addr + loadOffset*upSize % 64) == 0 - BREAK - FI - FI + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) - _MM_UPCONV_PS_UINT8: - RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_SINT8: - RETURN Convert_Int8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_UINT16: - RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) - _MM_UPCONV_PS_SINT16: - RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN 4 - _MM_UPCONV_PS_FLOAT16: - RETURN 2 - _MM_UPCONV_PS_UINT8: - RETURN 1 - _MM_UPCONV_PS_SINT8: - RETURN 1 - _MM_UPCONV_PS_UINT16: - RETURN 2 - _MM_UPCONV_PS_SINT16: - RETURN 2 - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 + Arithmetic + + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+ FOR j := 0 to 15 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize % 64) == 0 - foundNext64BytesBoundary := true - FI + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE - i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) - FI - loadOffset := loadOffset + 1 + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - - Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) - _MM_UPCONV_PS_UINT8: - RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_SINT8: - RETURN Convert_Int8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_UINT16: - RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) - _MM_UPCONV_PS_SINT16: - RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) - _MM_UPCONV_PS_UINT8: - RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_SINT8: - RETURN Convert_Int8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_UINT16: - RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) - _MM_UPCONV_PS_SINT16: - RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 + Arithmetic + + + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from 
packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + FOR j := 0 to 15 + i := j*32 IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) - FI - loadOffset := loadOffset + 1 - FI + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) - _MM_UPCONV_PS_UINT8: - RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_SINT8: - RETURN Convert_Int8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_UINT16: - RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) - _MM_UPCONV_PS_SINT16: - RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) - _MM_UPCONV_PS_UINT8: - RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_SINT8: - RETURN Convert_Int8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_UINT16: - RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) - _MM_UPCONV_PS_SINT16: - RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := MEM[mt] + Arithmetic + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". 
+ FOR j := 0 to 15 i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (mt + loadOffset * upSize) % 64 == 0 - BREAK - FI + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - - Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) - _MM_UPCONV_PS_UINT8: - RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_SINT8: - RETURN Convert_Int8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_UINT16: - RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) - _MM_UPCONV_PS_SINT16: - RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) - ESAC -} -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PS_NONE: - RETURN MEM[addr + 4*offset] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset]) - _MM_UPCONV_PS_UINT8: - RETURN Convert_UInt8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_SINT8: - RETURN Convert_Int8_To_FP32(MEM[addr + offset]) - _MM_UPCONV_PS_UINT16: - RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset]) - _MM_UPCONV_PS_SINT16: - RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset]) - ESAC -} -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := MEM[mt] + Arithmetic + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point 
elements in "a", and store the results in "dst". + [round_note] + FOR j := 0 to 15 - IF k[j] - i := j*32 - dst[i+31:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (mt + loadOffset * upSize) % 64 == 0 - BREAK - FI - FI + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC + Arithmetic + + + + + + Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) } -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 7 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize) % 64 == 0 - foundNext64BytesBoundary := true - FI +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] ELSE - i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) + tmp[i+31:i] := 0 FI - loadOffset := loadOffset + 1 ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_ADD(tmp, 16) - + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC + Arithmetic + + + + + + Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) } -dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -upSize := UPCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 7 +tmp := a +FOR j := 0 to 8 + i := j*64 IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*upSize) % 64 == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) - FI - loadOffset := loadOffset + 1 + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[63:0] := REDUCE_ADD(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quad that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. + Arithmetic + + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) } -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 7 +tmp := a +FOR j := 0 to 8 i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (mt + loadOffset * upSize) % 64 == 0 - BREAK + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[63:0] := REDUCE_ADD(tmp, 8) - + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quad that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elemenst are skipped when the corresponding mask bit is not set). + Arithmetic + + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". -DEFINE UPCONVERT(addr, offset, convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN MEM[addr + 8*offset] - ESAC -} -DEFINE UPCONVERTSIZE(convertTo) { - CASE conv OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) } -dst[511:0] := src[511:0] -loadOffset := 0 -upSize := UPCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 7 +tmp := a +FOR j := 0 to 16 + i := j*32 IF k[j] - i := j*64 - dst[i+63:i] := UPCONVERT(addr, loadOffset, conv) - loadOffset := loadOffset + 1 - IF (mt + loadOffset * upSize) % 64 == 0 - BREAK - FI + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 FI ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_ADD(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN element[31:0] - _MM_DOWNCONV_EPI32_UINT8: - RETURN Truncate8(element[31:0]) - _MM_DOWNCONV_EPI32_SINT8: - RETURN Saturate8(element[31:0]) - _MM_DOWNCONV_EPI32_UINT16: - RETURN Truncate16(element[31:0]) - _MM_DOWNCONV_EPI32_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN 4 - _MM_DOWNCONV_EPI32_UINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_SINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_UINT16: - RETURN 2 - _MM_DOWNCONV_EPI32_SINT16: - RETURN 2 - ESAC + Arithmetic + + + + + + Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". 
+ +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) } -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 15 - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] ELSE - i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC + tmp[i+31:i] := 1 FI - storeOffset := storeOffset + 1 ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_MUL(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresonding mask bit is not set). - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN element[31:0] - _MM_DOWNCONV_EPI32_UINT8: - RETURN Truncate8(element[31:0]) - _MM_DOWNCONV_EPI32_SINT8: - RETURN Saturate8(element[31:0]) - _MM_DOWNCONV_EPI32_UINT16: - RETURN Truncate16(element[31:0]) - _MM_DOWNCONV_EPI32_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN 4 - _MM_DOWNCONV_EPI32_UINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_SINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_UINT16: - RETURN 2 - _MM_DOWNCONV_EPI32_SINT16: - RETURN 2 - ESAC + Arithmetic + + + + + + Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". 
+ +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) } -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 15 +tmp := a +FOR j := 0 to 8 + i := j*64 IF k[j] - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC - FI - storeOffset := storeOffset + 1 + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1 FI ENDFOR -dst[MAX:512] := 0 +dst[63:0] := REDUCE_MUL(tmp, 8) - + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN element[31:0] - _MM_DOWNCONV_EPI32_UINT8: - RETURN Truncate8(element[31:0]) - _MM_DOWNCONV_EPI32_SINT8: - RETURN Saturate8(element[31:0]) - _MM_DOWNCONV_EPI32_UINT16: - RETURN Truncate16(element[31:0]) - _MM_DOWNCONV_EPI32_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN 4 - _MM_DOWNCONV_EPI32_UINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_SINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_UINT16: - RETURN 2 - _MM_DOWNCONV_EPI32_SINT16: - RETURN 2 - ESAC +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) } -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 15 - i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC - storeOffset := storeOffset + 1 - IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1.0 FI ENDFOR 
-dst[MAX:512] := 0 +dst[63:0] := REDUCE_MUL(tmp, 8) - + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are written to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Arithmetic + + + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN element[31:0] - _MM_DOWNCONV_EPI32_UINT8: - RETURN Truncate8(element[31:0]) - _MM_DOWNCONV_EPI32_SINT8: - RETURN Saturate8(element[31:0]) - _MM_DOWNCONV_EPI32_UINT16: - RETURN Truncate16(element[31:0]) - _MM_DOWNCONV_EPI32_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_DOWNCONV_EPI32_NONE: - RETURN 4 - _MM_DOWNCONV_EPI32_UINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_SINT8: - RETURN 1 - _MM_DOWNCONV_EPI32_UINT16: - RETURN 2 - _MM_DOWNCONV_EPI32_SINT16: - RETURN 2 - ESAC +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) } -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 15 +tmp := a +FOR j := 0 to 16 + i := j*32 IF k[j] - i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC - storeOffset := storeOffset + 1 - 
IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK - FI + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := FP32(1.0) FI ENDFOR -dst[MAX:512] := 0 +dst[31:0] := REDUCE_MUL(tmp, 16) - + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN element[63:0] - ESAC + Arithmetic + + + + + Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) } -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC +dst[31:0] := REDUCE_ADD(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) } -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 7 - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE +dst[63:0] := REDUCE_ADD(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) i := j*64 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_ADD(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] FI - storeOffset := storeOffset + 1 -ENDFOR -dst[MAX:512] := 0 + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_ADD(a, 16) - + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresonding mask bit is not set). - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN element[63:0] - ESAC + Arithmetic + + + + + Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) } -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC +dst[31:0] := REDUCE_MUL(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) } -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 7 - IF k[j] - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*64 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC - FI - storeOffset := storeOffset + 1 +dst[63:0] := REDUCE_MUL(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MUL(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MUL(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ABS(v2[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. + Arithmetic + + + + + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN element[63:0] - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC -} -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 7 - i := j*63 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC - storeOffset := storeOffset + 1 - IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(v2[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped whent he corresponding mask bit is not set). + Arithmetic + + + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst". -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN element[63:0] - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_EPI64_NONE: - RETURN 8 - ESAC -} -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt FOR j := 0 to 7 - IF k[j] - i := j*63 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC - storeOffset := storeOffset + 1 - IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK - FI - FI + i := j*64 + dst[i+63:i] := ABS(v2[i+63:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN element[31:0] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP32_To_FP16(element[31:0]) - _MM_UPCONV_PS_UINT8: - RETURN Truncate8(element[31:0]) - _MM_UPCONV_PS_SINT8: - RETURN Saturate8(element[31:0]) - _MM_UPCONV_PS_UINT16: - RETURN Truncate16(element[31:0]) - _MM_UPCONV_PS_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN 4 - _MM_UPCONV_PS_FLOAT16: - RETURN 2 - _MM_UPCONV_PS_UINT8: - RETURN 1 - _MM_UPCONV_PS_SINT8: - RETURN 1 - _MM_UPCONV_PS_UINT16: - RETURN 2 - _MM_UPCONV_PS_SINT16: - RETURN 2 - ESAC -} -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 15 - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI + Arithmetic + + + + + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+ +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(v2[i+63:i]) ELSE - i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC + dst[i+63:i] := src[i+63:i] FI - storeOffset := storeOffset + 1 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN element[31:0] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP32_To_FP16(element[31:0]) - _MM_UPCONV_PS_UINT8: - RETURN Truncate8(element[31:0]) - _MM_UPCONV_PS_SINT8: - RETURN Saturate8(element[31:0]) - _MM_UPCONV_PS_UINT16: - RETURN Truncate16(element[31:0]) - _MM_UPCONV_PS_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN 4 - _MM_UPCONV_PS_FLOAT16: - RETURN 2 - _MM_UPCONV_PS_UINT8: - RETURN 1 - _MM_UPCONV_PS_SINT8: - RETURN 1 - _MM_UPCONV_PS_UINT16: - RETURN 2 - _MM_UPCONV_PS_SINT16: - RETURN 2 - ESAC -} -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 15 - IF k[j] - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC - FI - storeOffset := storeOffset + 1 - FI -ENDFOR + Arithmetic + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 
elements) in "dst". + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +dst[511:0] := temp[511:0] dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. + Miscellaneous + + + + + + + + + Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN element[31:0] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP32_To_FP16(element[31:0]) - _MM_UPCONV_PS_UINT8: - RETURN Truncate8(element[31:0]) - _MM_UPCONV_PS_SINT8: - RETURN Saturate8(element[31:0]) - _MM_UPCONV_PS_UINT16: - RETURN Truncate16(element[31:0]) - _MM_UPCONV_PS_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN 4 - _MM_UPCONV_PS_FLOAT16: - RETURN 2 - _MM_UPCONV_PS_UINT8: - RETURN 1 - _MM_UPCONV_PS_SINT8: - RETURN 1 - _MM_UPCONV_PS_UINT16: - RETURN 2 - _MM_UPCONV_PS_SINT16: - RETURN 2 - ESAC -} -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) FOR j := 0 to 15 i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC - storeOffset := storeOffset + 1 - IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] 
FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN element[31:0] - _MM_UPCONV_PS_FLOAT16: - RETURN Convert_FP32_To_FP16(element[31:0]) - _MM_UPCONV_PS_UINT8: - RETURN Truncate8(element[31:0]) - _MM_UPCONV_PS_SINT8: - RETURN Saturate8(element[31:0]) - _MM_UPCONV_PS_UINT16: - RETURN Truncate16(element[31:0]) - _MM_UPCONV_PS_SINT16: - RETURN Saturate16(element[31:0]) - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PS_NONE: - RETURN 4 - _MM_UPCONV_PS_FLOAT16: - RETURN 2 - _MM_UPCONV_PS_UINT8: - RETURN 1 - _MM_UPCONV_PS_SINT8: - RETURN 1 - _MM_UPCONV_PS_UINT16: - RETURN 2 - _MM_UPCONV_PS_SINT16: - RETURN 2 - ESAC -} -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 15 - IF k[j] - i := j*32 - tmp := DOWNCONVERT(v1[i+31:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 4: MEM[storeAddr] := tmp[31:0] - 2: MEM[storeAddr] := tmp[15:0] - 1: MEM[storeAddr] := tmp[7:0] - ESAC - storeOffset := storeOffset + 1 - IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK - FI - FI + Miscellaneous + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". 
This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN element[63:0] - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC -} -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 7 - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*64 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC - FI - storeOffset := storeOffset + 1 + Miscellaneous + + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN element[63:0] - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC -} -storeOffset := 0 -foundNext64BytesBoundary := false -downSize := DOWNCONVERTSIZE(conv) -addr := mt-64 -FOR j := 0 to 7 + Miscellaneous + + + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 IF k[j] - IF foundNext64BytesBoundary == false - IF ((addr + (storeOffset + 1)*downSize) % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*64 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC - FI - storeOffset := storeOffset + 1 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. - -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN element[63:0] - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC -} -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 7 - i := j*63 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC - storeOffset := storeOffset + 1 - IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK + Miscellaneous + + + + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - -DEFINE DOWNCONVERT(element, convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN element[63:0] - ESAC -} -DEFINE DOWNCONVERTSIZE(convertTo) { - CASE convertTo OF - _MM_UPCONV_PD_NONE: - RETURN 8 - ESAC -} -storeOffset := 0 -downSize := DOWNCONVERTSIZE(conv) -addr := mt -FOR j := 0 to 7 - IF k[j] - i := j*63 - tmp := DOWNCONVERT(v1[i+63:i], conv) - storeAddr := addr + storeOffset * downSize - CASE downSize OF - 8: MEM[storeAddr] := tmp[63:0] - ESAC - storeOffset := storeOffset + 1 - IF ((addr + storeOffset * downSize) % 64) == 0 - BREAK - FI - FI + Miscellaneous + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + Miscellaneous + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory). - -FOR j := 0 to 7 - i := j*64 - m := j*32 + Miscellaneous + + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 -FOR j := 0 to 15 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*4 % 64) == 0 - foundNext64BytesBoundary := true - FI + Miscellaneous + + + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE - i := j*32 - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := src[i+31:i] FI - loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 -FOR j := 0 to 15 - IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*4 % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*32 - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] - FI - loadOffset := loadOffset + 1 - FI + Miscellaneous + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". - -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt -FOR j := 0 to 15 - i := j*32 - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] - loadOffset := loadOffset + 1 - IF (mt + loadOffset * 4) % 64 == 0 - BREAK - FI + Miscellaneous + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt -FOR j := 0 to 15 - i := j*32 + Miscellaneous + + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 IF k[j] - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] - loadOffset := loadOffset + 1 - IF (mt + loadOffset * 4) % 64 == 0 - BREAK - FI + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 -FOR j := 0 to 7 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*8) == 0 - foundNext64BytesBoundary := true - FI + Miscellaneous + + + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE - i := j*64 - tmp := MEM[addr + loadOffset*8] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := src[i+63:i] FI - loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 -FOR j := 0 to 7 - IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*8) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*64 - tmp := MEM[addr + loadOffset*8] - dst[i+63:i] := tmp[i+63:i] - FI - loadOffset := loadOffset + 1 - FI + Miscellaneous + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quad that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". - -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt -FOR j := 0 to 7 - i := j*64 - tmp := MEM[addr + loadOffset*8] - dst[i+63:i] := tmp[i+63:i] - loadOffset := loadOffset + 1 - IF (addr + loadOffset*8 % 64) == 0 - BREAK - FI + Miscellaneous + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quad that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt -FOR j := 0 to 7 - i := j*64 + Miscellaneous + + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 IF k[j] - tmp := MEM[addr + loadOffset*8] - dst[i+63:i] := tmp[i+63:i] - loadOffset := loadOffset + 1 - IF (addr + loadOffset*8 % 64) == 0 - BREAK - FI + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 -FOR j := 0 to 15 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*4 % 64) == 0 - foundNext64BytesBoundary := true - FI + Miscellaneous + + + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE - i := j*32 - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := src[i+31:i] FI - loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 -FOR j := 0 to 15 + Miscellaneous + + + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*4 % 64) == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*32 - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] - FI - loadOffset := loadOffset + 1 + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". + Swizzle + + + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt FOR j := 0 to 15 i := j*32 - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] - loadOffset := loadOffset + 1 - IF (mt + loadOffset * 4) % 64 == 0 - BREAK + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Swizzle + + + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt FOR j := 0 to 15 i := j*32 IF k[j] - tmp := MEM[addr + loadOffset*4] - dst[i+31:i] := tmp[i+31:i] - loadOffset := loadOffset + 1 - IF (mt + loadOffset * 4) % 64 == 0 - BREAK - FI + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 + Swizzle + + + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + FOR j := 0 to 7 - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*8) % 64 == 0 - foundNext64BytesBoundary := true - FI + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] ELSE - i := j*64 - tmp := MEM[addr + loadOffset*8] - dst[i+63:i] := tmp[i+63:i] + dst[i+63:i] := a[i+63:i] FI - loadOffset := loadOffset + 1 ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - dst[511:0] := src[511:0] -loadOffset := 0 -foundNext64BytesBoundary := false -addr := mt-64 -FOR j := 0 to 7 + Swizzle + + + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name. + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 IF k[j] - IF foundNext64BytesBoundary == false - IF (addr + (loadOffset + 1)*8) % 64 == 0 - foundNext64BytesBoundary := true - FI - ELSE - i := j*64 - tmp := MEM[addr + loadOffset*8] - dst[i+63:i] := tmp[i+63:i] - FI - loadOffset := loadOffset + 1 + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quad that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". + Swizzle + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name. -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt -FOR j := 0 to 7 - i := j*64 - tmp := MEM[addr + loadOffset*8] - dst[i+63:i] := tmp[i+63:i] - loadOffset := loadOffset + 1 - IF ((addr + 8*loadOffset) % 64) == 0 - BREAK - FI +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quad that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Swizzle + + + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[511:0] := src[511:0] -loadOffset := 0 -addr := mt -FOR j := 0 to 7 - i := j*64 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 IF k[j] - tmp := MEM[addr + 
loadOffset*8] - dst[i+63:i] := tmp[i+63:i] - loadOffset := loadOffset + 1 - IF ((addr + 8*loadOffset) % 64) == 0 - BREAK - FI + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (m5-64)). + Swizzle + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 -FOR j := 0 to 15 - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*4) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - FI - storeOffset := storeOffset + 1 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (m5-64)). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 -FOR j := 0 to 15 - IF k[j] - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*4) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - FI - storeOffset := storeOffset + 1 - FI + Compare + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". -storeOffset := 0 -addr := mt -FOR j := 0 to 15 - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*4) % 64) == 0 - BREAK - FI -ENDFOR +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". -storeOffset := 0 -addr := mt -FOR j := 0 to 15 - IF k[j] - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*4) % 64) == 0 - BREAK - FI - FI +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 FOR j := 0 to 7 - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*8) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*64 - MEM[addr + storeOffset*8] := v1[i+63:i] - FI - storeOffset := storeOffset + 1 + i := j*64 + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 FOR j := 0 to 7 - IF k[j] - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*8) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*64 - MEM[addr + storeOffset*8] := v1[i+63:i] - FI - storeOffset := storeOffset + 1 - FI + i := j*64 + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". -storeOffset := 0 -addr := mt FOR j := 0 to 7 i := j*64 - MEM[addr + storeOffset*8] := v1[i+63:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*8) % 64) == 0 - BREAK - FI + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". -storeOffset := 0 -addr := mt FOR j := 0 to 7 - IF k[j] - i := j*64 - MEM[addr + storeOffset*8] := v1[i+63:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*8) % 64) == 0 - BREAK - FI - FI + i := j*64 + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). - -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 -FOR j := 0 to 15 - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*4) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - FI - storeOffset := storeOffset + 1 + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 -FOR j := 0 to 15 - IF k[j] - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*4) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - FI - storeOffset := storeOffset + 1 - FI + Compare + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). - -storeOffset := 0 -addr := mt -FOR j := 0 to 15 - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*4) % 64) == 0 - BREAK + Compare + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). - -storeOffset := 0 -addr := mt -FOR j := 0 to 15 - IF k[j] - i := j*32 - MEM[addr + storeOffset*4] := v1[i+31:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*4) % 64) == 0 - BREAK - FI + Compare + + + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 FOR j := 0 to 7 - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*8) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*64 - MEM[addr + storeOffset*4] := v1[i+63:i] + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 FI - storeOffset := storeOffset + 1 -ENDFOR +ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elemetns of the stream that map at or after the first 64-byte-aligned address following (m5-64)). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -storeOffset := 0 -foundNext64BytesBoundary := 0 -addr := mt-64 FOR j := 0 to 7 - IF k[j] - IF foundNext64BytesBoundary == 0 - IF ((addr + (storeOffset + 1)*8) % 64) == 0 - foundNext64BytesBoundary := 1 - FI - ELSE - i := j*64 - MEM[addr + storeOffset*4] := v1[i+63:i] - FI - storeOffset := storeOffset + 1 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -storeOffset := 0 -addr := mt FOR j := 0 to 7 i := j*64 - MEM[addr + storeOffset*8] := v1[i+63:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*8) % 64) == 0 - BREAK + IF k1[j] + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address follwing "mt"). Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set). + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -storeOffset := 0 -addr := mt FOR j := 0 to 7 - IF k[j] - i := j*64 - MEM[addr + storeOffset*8] := v1[i+63:i] - storeOffset := storeOffset + 1 - IF ((addr + storeOffset*8) % 64) == 0 - BREAK - FI + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR +k[MAX:8] := 0 - -
immintrin.h
-
- - KNCNI - Bit Manipulation - - - Counts the number of set bits in 32-bit unsigned integer "r1", returning the results in "dst". - dst[31:0] := PopCount(r1[31:0]) - - -
immintrin.h
-
- - KNCNI - Bit Manipulation - - - Counts the number of set bits in 64-bit unsigned integer "r1", returning the results in "dst". - dst[63:0] := PopCount(r1[63:0]) - - + + AVX512F
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Inserts the low byte of mask "k2" into the high byte of "dst", and copies the low byte of "k1" to the low byte of "dst". + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). -dst[7:0] := k1[7:0] -dst[15:8] := k2[7:0] +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst". The elements in the upper half of "dst" are set to 0. - [round_note] + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - k := j*32 - dst[k+31:k] := Convert_FP64_To_Int32(v2[i+63:i]) + IF k1[j] + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - - - Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements in the upper half of "dst" are set to 0. - [round_note] - -FOR j := 0 to 7 + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_FP64_To_Int32(v2[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] + IF k1[j] + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - Integer - KNCNI - Convert - - - - - Performs element-by-element conversion of packed 32-bit integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst". - [round_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := Int32ToFloat32(v2[i+31:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC + Compare + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:8] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst". - + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 15 i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - + Compare + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - - Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst". + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:512] := 0 + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - - - Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - - - Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - Multiply packed 32-bit integer elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst". + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - base := (j & ~0x3) * 32 - scale[31:0] := b[base+63:base+32] - bias[31:0] := b[base+31:base] - dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - - - Multiply packed 32-bit integer elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - base := (j & ~0x3) * 32 - scale[31:0] := b[base+63:base+32] - bias[31:0] := b[base+31:base] - dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - - Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst". - [round_note] + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - base := (j & ~0x3) * 32 - scale[31:0] := b[base+63:base+32] - bias[31:0] := b[base+31:base] - dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 15 i := j*32 - IF k[j] - base := (j & ~0x3) * 32 - scale[31:0] := b[base+63:base+32] - bias[31:0] := b[base+31:base] - dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". + Compare + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - - - Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Compare + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the maximum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". - FOR j := 0 to 15 + Compare + + + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - - - Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 FI -ENDFOR -dst[MAX:512] := 0 +ENDFOR +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". - FOR j := 0 to 15 + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) + IF k1[j] + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - - - Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the maximum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i]) + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - - - Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the minimum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst". - FOR j := 0 to 15 + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i]) + IF k1[j] + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - - - Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the minimum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i]) + Compare + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - - - Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst". + Compare + + + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - - - Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst". + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Arithmetic - - - - - - Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Swizzle - - - - Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst". + Compare + + + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". -DEFINE SELECT4(src, control) { - CASE control[1:0] OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -FOR j := 0 to 3 - i := j*128 - n := j*2 - dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Swizzle - - - - - - Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE control[1:0] OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp[511:0] := 0 -FOR j := 0 to 3 - i := j*128 - n := j*2 - tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) -ENDFOR + Compare + + + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 15 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst". - FOR j := 0 to 15 + Compare + + + + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - - - Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Convert - - - - - Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value using "expadj" and in the direction of "rounding", and store the results as packed single-precision floating-point elements in "dst". - [round_note] - FOR j := 0 to 15 + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ROUND(a[i+31:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Convert - - - - - - - Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value using "expadj" and in the direction of "rounding", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - FOR j := 0 to 15 + Compare + + + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := ROUND(a[i+31:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst". - [round_note] - FOR j := 0 to 15 + Compare + + + + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ROUND(a[i+31:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - - - Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - FOR j := 0 to 15 + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := ROUND(a[i+31:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst". - [round_note] - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Miscellaneous - - - - - - - Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ROUND(a[i+63:i]) - CASE expadj OF - _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 << 0) - _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 << 4) - _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 << 5) - _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 << 8) - _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 << 16) - _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 << 24) - _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 << 31) - _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 << 32) - ESAC - ELSE - dst[i+63:i] := src[i+63:i] - FI + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst". - FOR j := 0 to 15 + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := Sqrt(1.0 / a[i+31:i]) + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Elementary Math Functions - - - - - Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := Sqrt(1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst". + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i])) + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - - - Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Compare + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] - FI + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - - Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst". Intermediate elements are rounded using "rounding". - [round_note] - + Compare + + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[i+31:i] * POW(2.0,FP32(b[i+31:i])) + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - - - - Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exp, where the exp is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are rounded using constant "rounding". - [round_note] + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). FOR j := 0 to 15 i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst[MAX:512] := 0 +k[MAX:16] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". - min := a[31:0] -FOR j := 1 to 15 + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst := FpMin(min, a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst := min +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst" using writemask "k" (elements are ignored when the corresponding mask bit is not set). - min := a[31:0] -FOR j := 1 to 15 + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - IF k[j] - CONTINUE - ELSE - dst := FpMin(min, a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst := min +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". - min := a[63:0] -FOR j := 1 to 7 - i := j*64 - dst := FpMin(min, a[i+63:i]) + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst := min +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set). - min := a[63:0] -FOR j := 1 to 7 - i := j*64 - IF k[j] - CONTINUE - ELSE - dst := FpMin(min, a[i+63:i]) + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 FI ENDFOR -dst := min +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". - max := a[31:0] -FOR j := 1 to 15 + Compare + + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 i := j*32 - dst := FpMax(max, a[i+31:i]) + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI ENDFOR -dst := max +k[MAX:16] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set). - max := a[31:0] -FOR j := 1 to 15 + Compare + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 IF k[j] - CONTINUE + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE - dst := FpMax(max, a[i+31:i]) + dst[i+31:i] := src[i+31:i] FI ENDFOR -dst := max +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". - max := a[63:0] -FOR j := 1 to 7 - i := j*64 - dst := FpMax(max, a[i+63:i]) -ENDFOR -dst := max + Load + + + + + Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Special Math Functions - - - - Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set). - max := a[63:0] -FOR j := 1 to 7 + Load + + + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 i := j*64 IF k[j] - CONTINUE + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] ELSE - dst := FpMax(max, a[i+63:i]) + dst[i+63:i] := src[i+63:i] FI ENDFOR -dst := max - -
immintrin.h
-
- - KNCNI - Bit Manipulation - - - - Count the number of trailing zero bits in unsigned 32-bit integer "x" starting at bit "a", and return that count in "dst". - -tmp := a -IF tmp < 0 - tmp := 0 -FI -dst := 0 -IF tmp > 31 - dst := 32 -ELSE - DO WHILE ((tmp < 32) AND x[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 - OD -FI +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - KNCNI - Bit Manipulation - - - - Count the number of trailing zero bits in unsigned 64-bit integer "x" starting at bit "a", and return that count in "dst". + Load + + + + + Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -tmp := a -IF tmp < 0 - tmp := 0 -FI -dst := 0 -IF tmp > 63 - dst := 64 -ELSE - DO WHILE ((tmp < 64) AND x[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 - OD -FI - - -
immintrin.h
-
- - KNCNI - General Support - - - Stalls a thread without blocking other threads for 32-bit unsigned integer "r1" clock cycles. - BlockThread(r1) - - -
immintrin.h
-
- - KNCNI - General Support - - - Stalls a thread without blocking other threads for 64-bit unsigned integer "r1" clock cycles. - BlockThread(r1) - - -
immintrin.h
-
- - KNCNI - General Support - - - Set performance monitoring filtering mask to 32-bit unsigned integer "r1". - SetPerfMonMask(r1[31:0]) - - -
immintrin.h
-
- - KNCNI - General Support - - - Set performance monitoring filtering mask to 64-bit unsigned integer "r1". - SetPerfMonMask(r1[63:0]) - - -
immintrin.h
-
- - KNCNI - General Support - - - - Evicts the cache line containing the address "ptr" from cache level "level" (can be either 0 or 1). - CacheLineEvict(ptr, level) - - - -
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Performs a bitwise AND operation between NOT of "k2" and "k1", storing the result in "dst". - dst[15:0] := NOT(k2[15:0]) & k1[15:0] +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Moves high byte from "k2" to low byte of "k1", and moves low byte of "k2" to high byte of "k1". + Load + + + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -tmp[7:0] := k2[15:8] -k2[15:8] := k1[7:0] -k1[7:0] := tmp[7:0] -tmp[7:0] := k2[7:0] -k2[7:0] := k1[15:8] -k1[15:8] := tmp[7:0] - - -
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0. - dst[15:0] := k1[15:0] | k2[15:0] -IF dst == 0 - SetZF() -FI - - -
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's. - dst[15:0] := k1[15:0] | k2[15:0] -IF PopCount(dst[15:0]) == 16 - SetCF() -FI +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - KNCNI - Mask - - - Converts bit mask "k1" into an integer value, storing the results in "dst". + Load + + + + + Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -dst := ZeroExtend32(k1) +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - KNCNI - Mask - - - Converts integer "mask" into bitmask, storing the result in "dst". + Load + + + + + Load 512-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -dst := mask[15:0] +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Packs masks "k1" and "k2" into the high 32 bits of "dst". The rest of "dst" is set to 0. + Load + + + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -dst[63:48] := k1[15:0] -dst[47:32] := k2[15:0] -dst[31:0] := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Packs masks "k1" and "k2" into the low 32 bits of "dst". The rest of "dst" is set to 0. + Load + + + + + Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -dst[31:16] := k1[15:0] -dst[15:0] := k2[15:0] -dst[63:32] := 0 +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Extracts 16-bit value "b" from 64-bit integer "a", storing the result in "dst". + Load + + + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -CASE b[1:0] OF -0: dst[15:0] := a[63:48] -1: dst[15:0] := a[47:32] -2: dst[15:0] := a[31:16] -3: dst[15:0] := a[15:0] -ESAC -dst[MAX:15] := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst". + Load + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. FOR j := 0 to 15 i := j*32 - base := (j & ~0x3) * 32 - scale[31:0] := b[base+63:base+32] - bias[31:0] := b[base+31:base] - dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Arithmetic - - - - - - Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" and by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Load + + + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. FOR j := 0 to 15 i := j*32 + m := j*32 IF k[j] - base := (j & ~0x3) * 32 - scale[31:0] := b[base+63:base+32] - bias[31:0] := b[base+31:base] - dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI Load - - - - - - - Up-converts 8 single-precision (32-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal. + + + + + + + + + Up-converts 16 memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". AVX512 only supports _MM_UPCONV_EPI32_NONE. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 CASE conv OF _MM_UPCONV_EPI32_NONE: dst[i+31:i] := MEM[addr+31:addr] @@ -131356,28 +151079,28 @@ FOR j := 0 to 7 _MM_UPCONV_EPI32_SINT16: dst[i+31:i] := SignExtend32(MEM[addr+15:addr]) ESAC ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - KNCNI Load - - - - - - - - - Up-converts 8 single-precision (32-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + + + + + + + + + + + Up-converts 16 single-precision (32-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). AVX512 only supports _MM_UPCONV_EPI32_NONE. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 IF k[j] CASE conv OF @@ -131391,26 +151114,26 @@ FOR j := 0 to 7 dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - KNCNI Load - - - - - - - Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the load is non-temporal. + + + + + + + + + Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst". FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 CASE conv OF _MM_UPCONV_EPI64_NONE: dst[i+63:i] := MEM[addr+63:addr] @@ -131418,26 +151141,26 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - KNCNI Load - - - - - - - - - Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal. + + + + + + + + + + + Up-converts 8 double-precision (64-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 IF k[j] CASE conv OF @@ -131449,24 +151172,24 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Load - - - - - - - Up-converts 8 memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst". "hint" indicates to the processor whether the load is non-temporal. + + + + + + + + + Up-converts 16 memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst". AVX512 only supports _MM_UPCONV_PS_NONE. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 CASE conv OF _MM_UPCONV_PS_NONE: dst[i+31:i] := MEM[addr+31:addr] @@ -131477,28 +151200,28 @@ FOR j := 0 to 7 _MM_UPCONV_PS_SINT16: dst[i+31:i] := Convert_Int16_To_FP32(MEM[addr+15:addr]) ESAC ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Load - - - - - - - - - Up-converts 8 memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal. + + + + + + + + + + + Up-converts 16 single-precision (32-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). AVX512 only supports _MM_UPCONV_PS_NONE. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 IF k[j] CASE conv OF @@ -131513,26 +151236,26 @@ FOR j := 0 to 7 dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Load - - - - - - - Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal. + + + + + + + + + Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst". FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 CASE conv OF _MM_UPCONV_PD_NONE: dst[i+63:i] := MEM[addr+63:addr] @@ -131540,30 +151263,31 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Load - - - - - - - - - Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal. + + + + + + + + + + + Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 IF k[j] CASE conv OF - _MM_UPCONV_PD_NONE: dst[i+63:i] := MEM[addr+63:addr] + _MM_UPCONV_PD_NONE: + dst[i+63:i] := MEM[addr+63:addr] ESAC ELSE dst[i+63:i] := src[i+63:i] @@ -131571,1960 +151295,2167 @@ FOR j := 0 to 7 ENDFOR dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - - Down-converts 8 packed single-precision (32-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) - _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) - _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) - _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) - _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) - ESAC -ENDFOR - -
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - - - Down-converts 8 packed single-precision (32-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are only written when the corresponding mask bit is set in "k"; otherwise, elements are unchanged in memory. "hint" indicates to the processor whether the data is non-temporal. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - - IF k[j] - CASE conv OF - _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) - _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) - _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) - _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) - _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) - ESAC - FI -ENDFOR - -
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - - Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. + Load + + + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ENDFOR +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - - - - - - Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are written to memory using writemask "k" (elements are not stored to memory when the corresponding mask bit is not set; the memory location is left unchagned). "hint" indicates to the processor whether the data is non-temporal. + Load + + + + + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - - IF k[j] - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC - FI -ENDFOR - -
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - - Down-converts the low 8 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i]) - ESAC -ENDFOR - -
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - - - Down-converts the low 8 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are written to memory using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, the memory location is left unchanged). "hint" indicates to the processor whether the data is non-temporal. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - - IF k[j] - CASE conv OF - _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] - _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) - _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) - _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i]) - ESAC + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - - Down-converts 8 packed 64-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the load is non-temporal. + Load + + + + + + + Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" them in "dst". FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] ENDFOR +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Store - - - - - - - - - Down-converts 8 packed 64-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. + Load + + + + + + + + + Loads 8 double-precision (64-bit) floating-point elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - + m := j*32 IF k[j] - CASE conv OF - _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] - ESAC + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Store - - - - Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst". + Load + + + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE control[1:0] OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -FOR j := 0 to 3 - i := j*128 - n := j*2 - dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR dst[MAX:512] := 0 - + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Swizzle - - - - - - Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -DEFINE SELECT4(src, control) { - CASE control[1:0] OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp[511:0] := 0 -FOR j := 0 to 3 - i := j*128 - n := j*2 - tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n]) -ENDFOR FOR j := 0 to 15 + i := j*32 IF k[j] - dst[i+31:i] := tmp[i+31:i] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0 - -
immintrin.h
-
- - Integer - KNCNI - Load - - - - - Loads 8 32-bit integer memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst". - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:256] := 0 - + + AVX512F
immintrin.h
-
- - Integer - KNCNI - Load - - - - - - - Loads 8 32-bit integer memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - Loads 8 single-precision (32-bit) floating-point memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst". + Move + + + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI - Load - - - - - - - Loads 8 single-precision (32-bit) floating-point memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Move + + + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. FOR j := 0 to 7 - i := j*32 - m := j*64 + i := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] FI ENDFOR -dst[MAX:256] := 0 + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Store + + - - - - - Stores 8 packed single-precision (32-bit) floating-point elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR +MEM[mem_addr+511:mem_addr] := a[511:0] + + AVX512F
immintrin.h
-
- - Floating Point - KNCNI Store + + - - - - - - Stores 8 packed single-precision (32-bit) floating-point elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set). + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI ENDFOR + + AVX512F
immintrin.h
-
- - Integer - KNCNI Store + + - - - - - Stores 8 packed 32-bit integer elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR +MEM[mem_addr+511:mem_addr] := a[511:0] + + AVX512F
immintrin.h
-
- - Integer - KNCNI Store + + - - - - - - Stores 8 packed 32-bit integer elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set). + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 +FOR j := 0 to 15 i := j*32 - m := j*64 IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI ENDFOR + + AVX512F
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Move the high element from "k1" to the low element of "k1", and insert the low element of "k2" into the high element of "k1". + Store + + + + + + Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -tmp[7:0] := k1[15:8] -k1[15:8] := k2[7:0] -k1[7:0] := tmp[7:0] +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - Mask - KNCNI - Mask - - - - Insert the low element of "k2" into the high element of "k1". + Store + + + + + + Store 512-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -k1[15:8] := k2[7:0] +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - Integer - LZCNT - Bit Manipulation - - - Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst". + Store + + + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -tmp := 31 -dst := 0 -DO WHILE (tmp >= 0 AND a[tmp] == 0) - tmp := tmp - 1 - dst := dst + 1 -OD +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR - + + AVX512F
immintrin.h
-
- - Integer - LZCNT - Bit Manipulation - - - Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst". + Store + + + + + + Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. -tmp := 63 -dst := 0 -DO WHILE (tmp >= 0 AND a[tmp] == 0) - tmp := tmp - 1 - dst := dst + 1 -OD +MEM[mem_addr+511:mem_addr] := a[511:0] - + + AVX512F
immintrin.h
-
- - Integer - MMX - Convert - - - Copy 64-bit integer "a" to "dst". - -dst[63:0] := a[63:0] - - -
mmintrin.h
-
- - Integer - MMX - Convert - - - Copy 64-bit integer "a" to "dst". - -dst[63:0] := a[63:0] - - -
mmintrin.h
-
- - MMX - General Support + Store + + - - Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. - -
mmintrin.h
-
- - Integer - MMX - Convert - - - Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". - -dst[31:0] := a[31:0] -dst[63:32] := 0 - - -
mmintrin.h
-
- - Integer - MMX - Convert - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - - -
mmintrin.h
-
- - Integer - MMX - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(b[15:0]) -dst[47:40] := Saturate8(b[31:16]) -dst[55:48] := Saturate8(b[47:32]) -dst[63:56] := Saturate8(b[63:48]) - - -
mmintrin.h
-
- - Integer - MMX - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(b[31:0]) -dst[63:48] := Saturate16(b[63:32]) - - -
mmintrin.h
-
- - Integer - MMX - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(b[15:0]) -dst[47:40] := SaturateU8(b[31:16]) -dst[55:48] := SaturateU8(b[47:32]) -dst[63:56] := SaturateU8(b[63:48]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[39:32] - dst[15:8] := src2[39:32] - dst[23:16] := src1[47:40] - dst[31:24] := src2[47:40] - dst[39:32] := src1[55:48] - dst[47:40] := src2[55:48] - dst[55:48] := src1[63:56] - dst[63:56] := src2[63:56] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[47:32] - dst[31:16] := src2[47:32] - dst[47:32] := src1[63:48] - dst[63:48] := src2[63:48] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -dst[31:0] := a[63:32] -dst[63:32] := b[63:32] +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. -DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. -dst[31:0] := a[31:0] -dst[63:32] := b[31:0] +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". AVX512 only supports _MM_DOWNCONV_PS_NONE. -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) + _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) + _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) + _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) + _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) + ESAC ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" according to "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are written only when the corresponding mask bit is not set). AVX512 only supports _MM_DOWNCONV_PS_NONE. -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i]) + _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i]) + _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i]) + _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i]) + _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i]) + ESAC + FI ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC + FI ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i] + ESAC + FI ENDFOR - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" and to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] ENDFOR - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Arithmetic - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI ENDFOR - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Arithmetic - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE. -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+15:i]) + ESAC ENDFOR - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Arithmetic - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + + + Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Elements are written using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, elements are left unchanged in memory). "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE. -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + + IF k[j] + CASE conv OF + _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i] + _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i]) + _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i]) + _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+15:i]) + ESAC + FI ENDFOR - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Arithmetic - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Store +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] AND b[i+31:i] ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Arithmetic - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Arithmetic - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Arithmetic - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR - - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[15:0] -ENDFOR +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI -ENDFOR +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) -FI +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) -FI +dst[511:0] := (a[511:0] OR b[511:0]) +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the resut in "dst". -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + k[j] := 0 FI ENDFOR +k[MAX:16] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[511:0] := (a[511:0] XOR b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + dst[i+63:i] := src[i+63:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_AND(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + tmp[i+31:i] := 0xFFFFFFFF FI ENDFOR +dst[31:0] := REDUCE_AND(tmp, 16) - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_AND(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF FI ENDFOR +dst[63:0] := REDUCE_AND(tmp, 8) - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". -FOR j := 0 to 1 +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_OR(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 + IF k[j] + tmp[i+31:i] := a[i+31:i] ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + tmp[i+31:i] := 0 FI ENDFOR +dst[31:0] := REDUCE_OR(tmp, 16) - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". -FOR j := 0 to 1 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_OR(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + tmp[i+63:i] := 0 FI ENDFOR +dst[63:0] := REDUCE_OR(tmp, 8) - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) -FI +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_AND(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_AND(a, 16) - -
mmintrin.h
-
- - Floating Point - Integer - MMX - Shift - - - - Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) -FI +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_AND(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_AND(a, 8) - -
mmintrin.h
-
- - Integer - MMX + AVX512F +
immintrin.h
Logical - - - - Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". +
+ + + + Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". -dst[63:0] := (a[63:0] AND b[63:0]) +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_OR(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_OR(a, 16) - -
mmintrin.h
-
- - Integer - MMX + AVX512F +
immintrin.h
Logical - - - - Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". +
+ + + + Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". -dst[63:0] := ((NOT a[63:0]) AND b[63:0]) +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_OR(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_OR(a, 8) - -
mmintrin.h
-
- - Integer - MMX + AVX512F +
immintrin.h
Logical - - - - Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". +
+ + + + + + + Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := (a[63:0] OR b[63:0]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] & v3[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Logical - - - - Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := (a[63:0] XOR b[63:0]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - MMX - General Support - - - Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Int32(-0x80000000) + FI ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Int64(-0x8000000000000000) + FI ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Cast_FP64(0xFFEFFFFFFFFFFFFF) + FI ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Cast_FP32(0xFF7FFFFF) + FI ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Int32(0x7FFFFFFF) + FI ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Int64(0x7FFFFFFFFFFFFFFF) + FI ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 1 +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0xFFFFFFFF + FI ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF + FI ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 8 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Cast_FP64(0x7FEFFFFFFFFFFFFF) + FI +ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 16 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Cast_FP32(0x7F7FFFFF) + FI ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) -ENDFOR +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] -ENDFOR +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) - -
mmintrin.h
-
- - Integer - MMX - Arithmetic - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[15:0] -ENDFOR +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) - -
mmintrin.h
-
- - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) FI -ENDFOR + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) - -
mmintrin.h
-
- - Integer - MMX - Shift - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) FI -ENDFOR + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) - -
mmintrin.h
-
- - Integer - MMX - Shift - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 @@ -133532,119 +153463,95 @@ FOR j := 0 to 1 dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Shift - - - - Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". - -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) -FI - - -
mmintrin.h
-
- - Integer - MMX - Shift - - - - Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". - -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) -FI - - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". +
+ + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + dst[i+31:i] := 0 FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) @@ -133652,83 +153559,95 @@ FOR j := 0 to 1 dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + dst[i+31:i] := src[i+31:i] FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - +
+ + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 IF imm8[7:0] > 31 dst[i+31:i] := 0 @@ -133736,438 +153655,471 @@ FOR j := 0 to 1 dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) -FI +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX + + AVX512F +
immintrin.h
Shift - - - - Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) -FI +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Logical - - - - Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + + AVX512F +
immintrin.h
+ Shift +
+ + + + Cast vector of type __m512d to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". -dst[63:0] := (a[63:0] AND b[63:0]) +FOR j := 0 to 7 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_FP32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Logical - - - - Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := ((NOT a[63:0]) AND b[63:0]) +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[l+63:l] := Convert_FP32_To_FP64(v2[i+31:i]) + ELSE + dst[l+63:l] := src[l+63:l] + FI +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Logical - - - - Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". -dst[63:0] := (a[63:0] OR b[63:0]) +FOR j := 0 to 7 + i := j*32 + l := j*64 + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Logical - - - - Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -dst[63:0] := (a[63:0] XOR b[63:0]) +FOR j := 0 to 7 + i := j*32 + n := j*64 + IF k[j] + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) + ELSE + dst[n+63:n] := src[n+63:n] + FI +ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) + ELSE + dst[l+63:l] := src[l+63:l] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +FOR j := 0 to 7 + i := j*64 + k := j*32 + dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + + AVX512F +
immintrin.h
+ Convert +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). -FOR j := 0 to 1 +FOR j := 0 to 15 i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI ENDFOR +dst[MAX:512] := 0 - -
mmintrin.h
-
- - Integer - MMX - Convert - - - Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". + + KNCNI +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst". -dst[31:0] := a[31:0] -dst[63:32] := 0 +tmp := 31 +dst := 0 +DO WHILE (tmp >= 0 AND a[tmp] == 0) + tmp := tmp - 1 + dst := dst + 1 +OD - -
mmintrin.h
-
- - Integer - MMX - Convert - - - Copy the lower 32-bit integer in "a" to "dst". + + LZCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst". -dst[31:0] := a[31:0] +tmp := 63 +dst := 0 +DO WHILE (tmp >= 0 AND a[tmp] == 0) + tmp := tmp - 1 + dst := dst + 1 +OD - -
mmintrin.h
-
- - Integer - MMX - Convert - - + + LZCNT +
immintrin.h
+ Bit Manipulation +
+ + + + + Copy 64-bit integer "a" to "dst". dst[63:0] := a[63:0] - -
mmintrin.h
-
- - Integer + MMX +
mmintrin.h
Convert - - +
+ + + Copy 64-bit integer "a" to "dst". dst[63:0] := a[63:0] - -
mmintrin.h
-
- - Integer + MMX - Set - - - Return vector of type __m64 with all elements set to zero. - -dst[MAX:0] := 0 - -
mmintrin.h
-
- - Integer - MMX - Set - - - - Set packed 32-bit integers in "dst" with the supplied values. + Convert + + + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". -dst[31:0] := e0 -dst[63:32] := e1 +dst[31:0] := a[31:0] +dst[63:32] := 0 -
mmintrin.h
-
- - Integer + MMX - Set - - - - - - Set packed 16-bit integers in "dst" with the supplied values. - -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 -
mmintrin.h
-
- - Integer - MMX - Set - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. + Convert + + + + + Copy the lower 32-bit integer in "a" to "dst". -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 +dst[31:0] := a[31:0] -
mmintrin.h
-
- - Integer + MMX - Set - - - Broadcast 32-bit integer "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -
mmintrin.h
-
- - Integer - MMX - Set - - - Broadcast 16-bit integer "a" to all all elements of "dst". + Convert + + + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR +dst[31:0] := a[31:0] +dst[63:32] := 0 -
mmintrin.h
-
- - Integer + MMX - Set - - - Broadcast 8-bit integer "a" to all elements of "dst". +
mmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR +dst[31:0] := a[31:0] -
mmintrin.h
-
- - Integer + MMX - Set - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". -dst[31:0] := e1 -dst[63:32] := e0 +dst[63:0] := a[63:0] -
mmintrin.h
-
- - Integer + MMX - Set - - - - - - Set packed 16-bit integers in "dst" with the supplied values in reverse order. +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". -dst[15:0] := e3 -dst[31:16] := e2 -dst[47:32] := e1 -dst[63:48] := e0 +dst[63:0] := a[63:0] + + MMX
mmintrin.h
-
- - Integer + Convert + + + + + Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. + MMX - Set - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values in reverse order. - -dst[7:0] := e7 -dst[15:8] := e6 -dst[23:16] := e5 -dst[31:24] := e4 -dst[39:32] := e3 -dst[47:40] := e2 -dst[55:48] := e1 -dst[63:56] := e0 -
mmintrin.h
-
- - Integer + General Support + + + + + Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. + MMX - Miscellaneous - - - +
mmintrin.h
+ General Support +
+ + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". dst[7:0] := Saturate8(a[15:0]) @@ -134176,4549 +154128,4125 @@ dst[23:16] := Saturate8(a[47:32]) dst[31:24] := Saturate8(a[63:48]) dst[39:32] := Saturate8(b[15:0]) dst[47:40] := Saturate8(b[31:16]) -dst[55:48] := Saturate8(b[47:32]) -dst[63:56] := Saturate8(b[63:48]) - - -
mmintrin.h
-
- - Integer - MMX - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(b[31:0]) -dst[63:48] := Saturate16(b[63:32]) - - -
mmintrin.h
-
- - Integer - MMX - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(b[15:0]) -dst[47:40] := SaturateU8(b[31:16]) -dst[55:48] := SaturateU8(b[47:32]) -dst[63:56] := SaturateU8(b[63:48]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[39:32] - dst[15:8] := src2[39:32] - dst[23:16] := src1[47:40] - dst[31:24] := src2[47:40] - dst[39:32] := src1[55:48] - dst[47:40] := src2[55:48] - dst[55:48] := src1[63:56] - dst[63:56] := src2[63:56] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[47:32] - dst[31:16] := src2[47:32] - dst[47:32] := src1[63:48] - dst[63:48] := src2[63:48] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". - -dst[31:0] := a[63:32] -dst[63:32] := b[63:32] - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) - - -
mmintrin.h
-
- - Integer - MMX - Swizzle - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". - -dst[31:0] := a[31:0] -dst[63:32] := b[31:0] - - -
mmintrin.h
-
- - MONITOR - General Support - - - - - Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints". - -
pmmintrin.h
-
- - MONITOR - General Support - - - - Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR. - -
pmmintrin.h
-
- - MOVBE - Load - - - Load 16 bits from memory, perform a byte swap operation, and store the result in "dst". - -FOR j := 0 to 1 - i := j*8 - dst[i+7:i] := MEM[ptr+15-i:ptr+8-i] -ENDFOR - - -
immintrin.h
-
- - MOVBE - Load - - - Load 32 bits from memory, perform a byte swap operation, and store the result in "dst". - -FOR j := 0 to 3 - i := j*8 - dst[i+7:i] := MEM[ptr+31-i:ptr+24-i] -ENDFOR - - -
immintrin.h
-
- - MOVBE - Load - - - Load 64 bits from memory, perform a byte swap operation, and store the result in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MEM[ptr+63-i:ptr+56-i] -ENDFOR - - -
immintrin.h
-
- - MOVBE - Store - - - - Perform a bit swap operation of the 16 bits in "data", and store the results to memory. - -FOR j := 0 to 1 - i := j*8 - MEM[ptr+i+7:ptr+i] := data[15-i:8-i] -ENDFOR - - -
immintrin.h
-
- - MOVBE - Store - - - - Perform a bit swap operation of the 32 bits in "data", and store the results to memory. - -addr := MEM[ptr] -FOR j := 0 to 3 - i := j*8 - MEM[ptr+i+7:ptr+i] := data[31-i:24-i] -ENDFOR - - -
immintrin.h
-
- - MOVBE - Store - - - - Perform a bit swap operation of the 64 bits in "data", and store the results to memory. - -addr := MEM[ptr] -FOR j := 0 to 7 - i := j*8 - MEM[ptr+i+7:ptr+i] := data[63-i:56-i] -ENDFOR - - -
immintrin.h
-
- - MOVDIR64B - Store - - - - Move 64-byte (512-bit) value using direct store from source memory address "src" to destination memory address "dst". - -MEM[dst+511:dst] := MEM[src+511:src] - - -
immintrin.h
-
- - MOVDIRI - Store - - - - Store 64-bit integer from "val" into memory using direct store. - -MEM[dst+63:dst] := val[63:0] - - -
immintrin.h
-
- - MOVDIRI - Store - - - - Store 32-bit integer from "val" into memory using direct store. - -MEM[dst+31:dst] := val[31:0] - - -
immintrin.h
-
- - MPX - Miscellaneous - - - - Make a pointer with the value of "srcmem" and bounds set to ["srcmem", "srcmem" + "size" - 1], and store the result in "dst". - dst := srcmem -dst.LB := srcmem.LB -dst.UB := srcmem + size - 1 - - -
immintrin.h
-
- - MPX - Miscellaneous - - - - - Narrow the bounds for pointer "q" to the intersection of the bounds of "r" and the bounds ["q", "q" + "size" - 1], and store the result in "dst". - dst := q -IF r.LB > (q + size - 1) OR r.UB < q - dst.LB := 1 - dst.UB := 0 -ELSE - dst.LB := MAX(r.LB, q) - dst.UB := MIN(r.UB, (q + size - 1)) -FI - -
immintrin.h
-
- - MPX - Miscellaneous - - - - Make a pointer with the value of "q" and bounds set to the bounds of "r" (e.g. copy the bounds of "r" to pointer "q"), and store the result in "dst". - dst := q -dst.LB := r.LB -dst.UB := r.UB - -
immintrin.h
-
- - MPX - Miscellaneous - - - Make a pointer with the value of "q" and open bounds, which allow the pointer to access the entire virtual address space, and store the result in "dst". - dst := q -dst.LB := 0 -dst.UB := 0 - -
immintrin.h
-
- - MPX - Miscellaneous - - - - Stores the bounds of "ptr_val" pointer in memory at address "ptr_addr". - MEM[ptr_addr].LB := ptr_val.LB -MEM[ptr_addr].UB := ptr_val.UB - - -
immintrin.h
-
- - MPX - Miscellaneous - - - Checks if "q" is within its lower bound, and throws a #BR if not. - IF q < q.LB - #BR -FI - - -
immintrin.h
-
- - MPX - Miscellaneous - - - Checks if "q" is within its upper bound, and throws a #BR if not. - IF q > q.UB - #BR -FI - - - -
immintrin.h
-
- - MPX - Miscellaneous - - - - Checks if ["q", "q" + "size" - 1] is within the lower and upper bounds of "q" and throws a #BR if not. - IF (q + size - 1) < q.LB OR (q + size - 1) > q.UB - #BR -FI - - - -
immintrin.h
-
- - MPX - Miscellaneous - - - Return the lower bound of "q". - dst := q.LB +dst[55:48] := Saturate8(b[47:32]) +dst[63:56] := Saturate8(b[63:48]) -
immintrin.h
-
- - MPX + + MMX +
mmintrin.h
Miscellaneous - - - Return the upper bound of "q". - dst := q.UB - -
immintrin.h
-
- - Integer - Bit Manipulation - - - Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". -tmp := 0 -IF a == 0 - // dst is undefined -ELSE - DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - OD -FI -dst := tmp +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(b[31:0]) +dst[63:48] := Saturate16(b[63:32]) - -
immintrin.h
-
- - Integer - Bit Manipulation - - - Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". -tmp := 31 -IF a == 0 - // dst is undefined -ELSE - DO WHILE ((tmp > 0) AND a[tmp] == 0) - tmp := tmp - 1 - OD -FI -dst := tmp +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(b[15:0]) +dst[47:40] := SaturateU8(b[31:16]) +dst[55:48] := SaturateU8(b[47:32]) +dst[63:56] := SaturateU8(b[63:48]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". -tmp := 0 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 31) ? 0 : 1 -FI +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(b[15:0]) +dst[47:40] := Saturate8(b[31:16]) +dst[55:48] := Saturate8(b[47:32]) +dst[63:56] := Saturate8(b[63:48]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". -tmp := 31 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp > 0) AND a[tmp] == 0) - tmp := tmp - 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 0) ? 0 : 1 -FI +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(b[31:0]) +dst[63:48] := Saturate16(b[63:32]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". -tmp := 0 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp < 64) AND a[tmp] == 0) - tmp := tmp + 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 63) ? 0 : 1 -FI +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(b[15:0]) +dst[47:40] := SaturateU8(b[31:16]) +dst[55:48] := SaturateU8(b[47:32]) +dst[63:56] := SaturateU8(b[63:48]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". -tmp := 63 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp > 0) AND a[tmp] == 0) - tmp := tmp - 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 0) ? 0 : 1 -FI +DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[39:32] + dst[15:8] := src2[39:32] + dst[23:16] := src1[47:40] + dst[31:24] := src2[47:40] + dst[39:32] := src1[55:48] + dst[47:40] := src2[55:48] + dst[55:48] := src1[63:56] + dst[63:56] := src2[63:56] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 32-bit integer "a". + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] +DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[47:32] + dst[31:16] := src2[47:32] + dst[47:32] := src1[63:48] + dst[63:48] := src2[63:48] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] -MEM[addr] := ~dst[0] +dst[31:0] := a[63:32] +dst[63:32] := b[63:32] - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 32-bit integer "a", and set that bit to zero. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] -MEM[addr] := 0 +DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 32-bit integer "a", and set that bit to one. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] -MEM[addr] := 1 +DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 64-bit integer "a". + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". -addr := a + b -dst[0] := MEM[addr] +dst[31:0] := a[31:0] +dst[63:32] := b[31:0] - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". -addr := a + b -dst[0] := MEM[addr] -MEM[addr] := ~dst[0] +DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[39:32] + dst[15:8] := src2[39:32] + dst[23:16] := src1[47:40] + dst[31:24] := src2[47:40] + dst[39:32] := src1[55:48] + dst[47:40] := src2[55:48] + dst[55:48] := src1[63:56] + dst[63:56] := src2[63:56] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 64-bit integer "a", and set that bit to zero. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". -addr := a + b -dst[0] := MEM[addr] -MEM[addr] := 0 +DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[47:32] + dst[31:16] := src2[47:32] + dst[47:32] := src1[63:48] + dst[63:48] := src2[63:48] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) - -
immintrin.h
-
- - Integer - Flag - Bit Manipulation - - - - Return the bit at index "b" of 64-bit integer "a", and set that bit to one. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". -addr := a + b -dst[0] := MEM[addr] -MEM[addr] := 1 +dst[31:0] := a[63:32] +dst[63:32] := b[63:32] - -
immintrin.h
-
- - Integer - Bit Manipulation - - - Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". -dst[7:0] := a[31:24] -dst[15:8] := a[23:16] -dst[23:16] := a[15:8] -dst[31:24] := a[7:0] +DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) - -
immintrin.h
-
- - Integer - Bit Manipulation - - - Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". -dst[7:0] := a[63:56] -dst[15:8] := a[55:48] -dst[23:16] := a[47:40] -dst[31:24] := a[39:32] -dst[39:32] := a[31:24] -dst[47:40] := a[23:16] -dst[55:48] := a[15:8] -dst[63:56] := a[7:0] - - -
immintrin.h
-
- - Floating Point - Integer - Cast - - - Cast from type float to type unsigned __int32 without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - Integer - Cast - - - Cast from type double to type unsigned __int64 without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - Integer - Cast - - - Cast from type unsigned __int32 to type float without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Floating Point - Integer - Cast - - - Cast from type unsigned __int64 to type double without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned long integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". - // size := 32 or 64 -dst := a -count := shift AND (size - 1) -DO WHILE (count > 0) - tmp[0] := dst[size - 1] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD +DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) - - -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned long integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". - // size := 32 or 64 -dst := a -count := shift AND (size - 1) -DO WHILE (count > 0) - tmp[size - 1] := dst[0] - dst := (dst >> 1) OR tmp[size - 1] - count := count - 1 -OD + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := b[31:0] - -
immintrin.h
-
- - General Support - - - Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below: + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". -_FEATURE_GENERIC_IA32 -_FEATURE_FPU -_FEATURE_CMOV -_FEATURE_MMX -_FEATURE_FXSAVE -_FEATURE_SSE -_FEATURE_SSE2 -_FEATURE_SSE3 -_FEATURE_SSSE3 -_FEATURE_SSE4_1 -_FEATURE_SSE4_2 -_FEATURE_MOVBE -_FEATURE_POPCNT -_FEATURE_PCLMULQDQ -_FEATURE_AES -_FEATURE_F16C -_FEATURE_AVX -_FEATURE_RDRND -_FEATURE_FMA -_FEATURE_BMI -_FEATURE_LZCNT -_FEATURE_HLE -_FEATURE_RTM -_FEATURE_AVX2 -_FEATURE_KNCNI -_FEATURE_AVX512F -_FEATURE_ADX -_FEATURE_RDSEED -_FEATURE_AVX512ER -_FEATURE_AVX512PF -_FEATURE_AVX512CD -_FEATURE_SHA -_FEATURE_MPX -_FEATURE_AVX512BW -_FEATURE_AVX512VL -_FEATURE_AVX512VBMI -_FEATURE_AVX512_4FMAPS -_FEATURE_AVX512_4VNNIW -_FEATURE_AVX512_VPOPCNTDQ -_FEATURE_AVX512_BITALG -_FEATURE_AVX512_VBMI2 -_FEATURE_GFNI -_FEATURE_VAES -_FEATURE_VPCLMULQDQ -_FEATURE_AVX512_VNNI -_FEATURE_CLWB -_FEATURE_RDPID -_FEATURE_IBT -_FEATURE_SHSTK -_FEATURE_SGX -_FEATURE_WBNOINVD -_FEATURE_PCONFIG -_FEATURE_AXV512_4VNNIB -_FEATURE_AXV512_4FMAPH -_FEATURE_AXV512_BITALG2 -_FEATURE_AXV512_VP2INTERSECT +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR -
immintrin.h
-
- - General Support - - - Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This intrinsic does not check the processor vendor. See the valid feature flags below: + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". -_FEATURE_GENERIC_IA32 -_FEATURE_FPU -_FEATURE_CMOV -_FEATURE_MMX -_FEATURE_FXSAVE -_FEATURE_SSE -_FEATURE_SSE2 -_FEATURE_SSE3 -_FEATURE_SSSE3 -_FEATURE_SSE4_1 -_FEATURE_SSE4_2 -_FEATURE_MOVBE -_FEATURE_POPCNT -_FEATURE_PCLMULQDQ -_FEATURE_AES -_FEATURE_F16C -_FEATURE_AVX -_FEATURE_RDRND -_FEATURE_FMA -_FEATURE_BMI -_FEATURE_LZCNT -_FEATURE_HLE -_FEATURE_RTM -_FEATURE_AVX2 -_FEATURE_KNCNI -_FEATURE_AVX512F -_FEATURE_ADX -_FEATURE_RDSEED -_FEATURE_AVX512ER -_FEATURE_AVX512PF -_FEATURE_AVX512CD -_FEATURE_SHA -_FEATURE_MPX -_FEATURE_AVX512BW -_FEATURE_AVX512VL -_FEATURE_AVX512VBMI -_FEATURE_AVX512_4FMAPS -_FEATURE_AVX512_4VNNIW -_FEATURE_AVX512_VPOPCNTDQ -_FEATURE_AVX512_BITALG -_FEATURE_AVX512_VBMI2 -_FEATURE_GFNI -_FEATURE_VAES -_FEATURE_VPCLMULQDQ -_FEATURE_AVX512_VNNI -_FEATURE_CLWB -_FEATURE_RDPID -_FEATURE_IBT -_FEATURE_SHSTK -_FEATURE_SGX -_FEATURE_WBNOINVD -_FEATURE_PCONFIG -_FEATURE_AXV512_4VNNIB -_FEATURE_AXV512_4FMAPH -_FEATURE_AXV512_BITALG2 -_FEATURE_AXV512_VP2INTERSECT +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR -
immintrin.h
-
- - General Support - - - Read the Performance Monitor Counter (PMC) specified by "a", and store up to 64-bits in "dst". The width of performance counters is implementation specific. - dst[63:0] := ReadPMC(a) + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR - -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned 32-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". -dst := a -count := shift AND 31 -DO WHILE (count > 0) - tmp[0] := dst[31] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR - -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned 32-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". -dst := a -count := shift AND 31 -DO WHILE (count > 0) - tmp[31] := dst[0] - dst := (dst >> 1) OR tmp - count := count - 1 -OD +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR - -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned 16-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". -dst := a -count := shift AND 15 -DO WHILE (count > 0) - tmp[0] := dst[15] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR - -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned 16-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". -dst := a -count := shift AND 15 -DO WHILE (count > 0) - tmp[15] := dst[0] - dst := (dst >> 1) OR tmp - count := count - 1 -OD +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR - -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". -dst := a -count := shift AND 63 -DO WHILE (count > 0) - tmp[0] := dst[63] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR - -
immintrin.h
-
- - Integer - Shift - - - - Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". -dst := a -count := shift AND 63 -DO WHILE (count > 0) - tmp[63] := dst[0] - dst := (dst >> 1) OR tmp[63] - count := count - 1 -OD +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR - -
immintrin.h
-
- - Integer - Flag + + MMX +
mmintrin.h
Arithmetic - - - - - - Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". -tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 1 : 0) -MEM[out+31:out] := tmp[31:0] -dst[0] := tmp[32] -dst[7:1] := 0 +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR - -
immintrin.h
-
- - Integer - Flag + + MMX +
mmintrin.h
Arithmetic - - - - - - Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". -tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) -MEM[out+63:out] := tmp[63:0] -dst[0] := tmp[64] -dst[7:1] := 0 +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR - -
immintrin.h
-
- - Integer - Flag + + MMX +
mmintrin.h
Arithmetic - - - - - - Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 32-bit integer "b", and subtract the result from unsigned 32-bit integer "a". Store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". -tmp[32:0] := a[31:0] - (b[31:0] + (c_in > 0 ? 1 : 0)) -MEM[out+31:out] := tmp[31:0] -dst[0] := tmp[32] -dst[7:1] := 0 +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR - -
immintrin.h
-
- - Integer - Flag + + MMX +
mmintrin.h
Arithmetic - - - - - - Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 64-bit integer "b", and subtract the result from unsigned 64-bit integer "a". Store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". -tmp[64:0] := a[63:0] - (b[63:0] + (c_in > 0 ? 1 : 0)) -MEM[out+63:out] := tmp[63:0] -dst[0] := tmp[64] -dst[7:1] := 0 +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR - -
immintrin.h
-
- - Miscellaneous - - - Insert the 32-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. - -
immintrin.h
-
- - Miscellaneous - - - Insert the 64-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. - -
immintrin.h
-
- - Miscellaneous - - - - Invoke the Intel SGX enclave user (non-privilege) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. - -
immintrin.h
-
- - Miscellaneous - - - - Invoke the Intel SGX enclave system (privileged) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. - -
immintrin.h
-
- - Miscellaneous - - - - Invoke the Intel SGX enclave virtualized (VMM) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. - -
immintrin.h
-
- - Miscellaneous - - - Write back and flush internal caches. - Initiate writing-back and flushing of external - caches. - -
immintrin.h
-
- - Floating Point - Convert - - - Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". -dst[31:0] := Convert_FP16_To_FP32(a[15:0]) +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR -
emmintrin.h
-
- - Floating Point - Convert - - - - Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst". - [round_note] + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". -dst[15:0] := Convert_FP32_To_FP16(a[31:0]) +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR -
emmintrin.h
-
- - Integer - PCLMULQDQ - Application-Targeted - - - - - Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". -IF (imm8[0] == 0) - TEMP1 := a[63:0] -ELSE - TEMP1 := a[127:64] -FI -IF (imm8[4] == 0) - TEMP2 := b[63:0] -ELSE - TEMP2 := b[127:64] -FI -FOR i := 0 to 63 - TEMP[i] := (TEMP1[0] and TEMP2[i]) - FOR j := 1 to i - TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) - ENDFOR - dst[i] := TEMP[i] +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] ENDFOR -FOR i := 64 to 127 - TEMP[i] := 0 - FOR j := (i - 63) to 63 - TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) - ENDFOR - dst[i] := TEMP[i] + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[15:0] ENDFOR -dst[127] := 0 - -
wmmintrin.h
-
- - PCONFIG - Miscellaneous - - - - Invoke the PCONFIG leaf function specified by "a". The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to rbx, rcx, and rdx. May return the value in eax, depending on the semantics of the specified leaf function. - -
immintrin.h
-
- - Integer - Flag - POPCNT - Bit Manipulation - - - Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". -dst := 0 -FOR i := 0 to 31 - IF a[i] - dst := dst + 1 - FI +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] ENDFOR - -
nmmintrin.h
-
- - Integer - Flag - POPCNT - Bit Manipulation - - - Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". -dst := 0 -FOR i := 0 to 63 - IF a[i] - dst := dst + 1 - FI +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] ENDFOR - -
nmmintrin.h
-
- - Integer - Flag - POPCNT - Bit Manipulation - - - Count the number of bits set to 1 in 32-bit integer "a", and return that count in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". -dst := 0 -FOR i := 0 to 31 - IF a[i] - dst := dst + 1 - FI +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR - -
immintrin.h
-
- - Integer - Flag - POPCNT - Bit Manipulation - - - Count the number of bits set to 1 in 64-bit integer "a", and return that count in "dst". + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". -dst := 0 -FOR i := 0 to 63 - IF a[i] - dst := dst + 1 - FI +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ENDFOR - -
immintrin.h
-
- - PREFETCHWT1 - General Support - - - - Fetch the line of data from memory that contains address "p" to a location in the cache heirarchy specified by the locality hint "i". - -
xmmintrin.h
-
- - RDPID - General Support - - - Copy the IA32_TSC_AUX MSR (signature value) into "dst". - dst[31:0] := IA32_TSC_AUX[31:0] + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR - -
immintrin.h
-
- - Integer - Flag - RDRAND - Random - - - Read a hardware generated 16-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_RND_GEN.ready == 1 - val[15:0] := HW_RND_GEN.data - dst := 1 -ELSE - val[15:0] := 0 - dst := 0 -FI + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR - -
immintrin.h
-
- - Integer - Flag - RDRAND - Random - - - Read a hardware generated 32-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_RND_GEN.ready == 1 - val[31:0] := HW_RND_GEN.data - dst := 1 -ELSE - val[31:0] := 0 - dst := 0 -FI + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR - -
immintrin.h
-
- - Integer - Flag - RDRAND - Random - - - Read a hardware generated 64-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_RND_GEN.ready == 1 - val[63:0] := HW_RND_GEN.data - dst := 1 -ELSE - val[63:0] := 0 - dst := 0 -FI + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR - -
immintrin.h
-
- - Flag - RDSEED - Random - - - Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_NRND_GEN.ready == 1 - val[15:0] := HW_NRND_GEN.data - dst := 1 -ELSE - val[15:0] := 0 - dst := 0 -FI + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR - -
immintrin.h
-
- - Flag - RDSEED - Random - - - Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_NRND_GEN.ready == 1 - val[31:0] := HW_NRND_GEN.data - dst := 1 -ELSE - val[31:0] := 0 - dst := 0 -FI + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR - -
immintrin.h
-
- - Flag - RDSEED - Random - - - Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_NRND_GEN.ready == 1 - val[63:0] := HW_NRND_GEN.data - dst := 1 -ELSE - val[63:0] := 0 - dst := 0 -FI + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR - -
immintrin.h
-
- - RDTSCP - General Support - - - Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr". - dst[63:0] := TimeStampCounter -MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0] + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[15:0] +ENDFOR - -
immintrin.h
-
- - RTM - General Support - - - Force an RTM abort. The EAX register is updated to reflect an XABORT instruction caused the abort, and the "imm8" parameter will be provided in bits [31:24] of EAX. - Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction. - IF RTM_ACTIVE == 0 - // nop -ELSE - // restore architectural register state - // discard memory updates performed in transaction - // update EAX with status and imm8 value - eax[31:24] := imm8[7:0] - RTM_NEST_COUNT := 0 - RTM_ACTIVE := 0 - IF _64_BIT_MODE - RIP := fallbackRIP + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 ELSE - EIP := fallbackEIP + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) FI -FI +ENDFOR - -
immintrin.h
-
- - RTM - General Support - - - Specify the start of an RTM code region. - If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution. - On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction. Return status of ~0 (0xFFFF) if continuing inside transaction; all other codes are aborts. - IF RTM_NEST_COUNT < MAX_RTM_NEST_COUNT - RTM_NEST_COUNT := RTM_NEST_COUNT + 1 - IF RTM_NEST_COUNT == 1 - IF _64_BIT_MODE - fallbackRIP := RIP - ELSE IF _32_BIT_MODE - fallbackEIP := EIP - FI - - RTM_ACTIVE := 1 - // enter RTM execution, record register state, start tracking memory state + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) FI -ELSE - // RTM abort (see _xabort) -FI +ENDFOR - -
immintrin.h
-
- - RTM - General Support - - - Specify the end of an RTM code region. - If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically. - If the commit fails, the logical processor will perform an RTM abort. - IF RTM_ACTIVE == 1 - RTM_NEST_COUNT := RTM_NEST_COUNT - 1 - IF RTM_NEST_COUNT == 0 - // try to commit transaction - IF FAIL_TO_COMMIT_TRANSACTION - // RTM abort (see _xabort) - ELSE - RTM_ACTIVE := 0 - FI + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) FI -FI +ENDFOR - -
immintrin.h
-
- - RTM - General Support - - - Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise. - IF (RTM_ACTIVE == 1 OR HLE_ACTIVE == 1) - dst := 1 + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 ELSE - dst := 0 + dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) FI - -
immintrin.h
-
- - SERIALIZE - General Support - - Serialize instruction execution, ensuring all modifications to flags, registers, and memory by previous instructions are completed before the next instruction is fetched. - -
immintrin.h
-
- - Integer - SHA - Cryptography - - - - Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". -W0 := a[127:96] -W1 := a[95:64] -W2 := a[63:32] -W3 := a[31:0] -W4 := b[127:96] -W5 := b[95:64] -dst[127:96] := W2 XOR W0 -dst[95:64] := W3 XOR W1 -dst[63:32] := W4 XOR W2 -dst[31:0] := W5 XOR W3 +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) +FI - -
immintrin.h
-
- - Integer - SHA - Cryptography - - - - Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in "a" and the previous message values in "b", and store the result in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". -W13 := b[95:64] -W14 := b[63:32] -W15 := b[31:0] -W16 := (a[127:96] XOR W13) <<< 1 -W17 := (a[95:64] XOR W14) <<< 1 -W18 := (a[63:32] XOR W15) <<< 1 -W19 := (a[31:0] XOR W16) <<< 1 -dst[127:96] := W16 -dst[95:64] := W17 -dst[63:32] := W18 -dst[31:0] := W19 +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR - -
immintrin.h
-
- - Integer - SHA - Cryptography - - - - Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable "a", add that value to the scheduled values (unsigned 32-bit integers) in "b", and store the result in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -tmp := (a[127:96] <<< 30) -dst[127:96] := b[127:96] + tmp -dst[95:64] := b[95:64] -dst[63:32] := b[63:32] -dst[31:0] := b[31:0] - - -
immintrin.h
-
- - Integer - SHA - Cryptography - - - - - Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from "a" and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from "b", and store the updated SHA1 state (A,B,C,D) in "dst". "func" contains the logic functions and round constants. - IF (func[1:0] == 0) - f := f0() - K := K0 -ELSE IF (func[1:0] == 1) - f := f1() - K := K1 -ELSE IF (func[1:0] == 2) - f := f2() - K := K2 -ELSE IF (func[1:0] == 3) - f := f3() - K := K3 -FI -A := a[127:96] -B := a[95:64] -C := a[63:32] -D := a[31:0] -W[0] := b[127:96] -W[1] := b[95:64] -W[2] := b[63:32] -W[3] := b[31:0] -A[1] := f(B, C, D) + (A <<< 5) + W[0] + K -B[1] := A -C[1] := B <<< 30 -D[1] := C -E[1] := D -FOR i := 1 to 3 - A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K - B[i+1] := A[i] - C[i+1] := B[i] <<< 30 - D[i+1] := C[i] - E[i+1] := D[i] +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ENDFOR -dst[127:96] := A[4] -dst[95:64] := B[4] -dst[63:32] := C[4] -dst[31:0] := D[4] - -
immintrin.h
-
- - Integer - SHA - Cryptography - - - - Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". - W4 := b[31:0] -W3 := a[127:96] -W2 := a[95:64] -W1 := a[63:32] -W0 := a[31:0] -dst[127:96] := W3 + sigma0(W4) -dst[95:64] := W2 + sigma0(W3) -dst[63:32] := W1 + sigma0(W2) -dst[31:0] := W0 + sigma0(W1) + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR - -
immintrin.h
-
- - Integer - SHA - Cryptography - - - - Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst"." - W14 := b[95:64] -W15 := b[127:96] -W16 := a[31:0] + sigma1(W14) -W17 := a[63:32] + sigma1(W15) -W18 := a[95:64] + sigma1(W16) -W19 := a[127:96] + sigma1(W17) -dst[127:96] := W19 -dst[95:64] := W18 -dst[63:32] := W17 -dst[31:0] := W16 + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR - -
immintrin.h
-
- - Integer - SHA - Cryptography - - - - - Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from "a", an initial SHA256 state (A,B,E,F) from "b", and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from "k", and store the updated SHA256 state (A,B,E,F) in "dst". - A[0] := b[127:96] -B[0] := b[95:64] -C[0] := a[127:96] -D[0] := a[95:64] -E[0] := b[63:32] -F[0] := b[31:0] -G[0] := a[63:32] -H[0] := a[31:0] -W_K[0] := k[31:0] -W_K[1] := k[63:32] -FOR i := 0 to 1 - A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]) - B[i+1] := A[i] - C[i+1] := B[i] - D[i+1] := C[i] - E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i] - F[i+1] := E[i] - G[i+1] := F[i] - H[i+1] := G[i] + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ENDFOR -dst[127:96] := A[2] -dst[95:64] := B[2] -dst[63:32] := E[2] -dst[31:0] := F[2] - -
immintrin.h
-
- - SSE - Swizzle - - - - - - Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.). + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". -__m128 tmp3, tmp2, tmp1, tmp0; -tmp0 := _mm_unpacklo_ps(row0, row1); -tmp2 := _mm_unpacklo_ps(row2, row3); -tmp1 := _mm_unpackhi_ps(row0, row1); -tmp3 := _mm_unpackhi_ps(row2, row3); -row0 := _mm_movelh_ps(tmp0, tmp2); -row1 := _mm_movehl_ps(tmp2, tmp0); -row2 := _mm_movelh_ps(tmp1, tmp3); -row3 := _mm_movehl_ps(tmp3, tmp1); +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR -
xmmintrin.h
-
- - SSE - General Support - - - Get the unsigned 32-bit value of the MXCSR control and status register. - dst[31:0] := MXCSR + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR - -
xmmintrin.h
-
- - SSE - General Support - - - Set the MXCSR control and status register with the value in unsigned 32-bit integer "a". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". -MXCSR := a[31:0] +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR - -
xmmintrin.h
-
- - SSE - General Support - - Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT - dst[31:0] := MXCSR & _MM_EXCEPT_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) +FI -
xmmintrin.h
-
- - SSE - General Support - - - Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT - MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) +FI -
xmmintrin.h
-
- - SSE - General Support - - Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT - dst[31:0] := MXCSR & _MM_MASK_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR -
xmmintrin.h
-
- - SSE - General Support - - - Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT - MXCSR := a[31:0] AND ~_MM_MASK_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR -
xmmintrin.h
-
- - SSE - General Support - - Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO - dst[31:0] := MXCSR & _MM_ROUND_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR -
xmmintrin.h
-
- - SSE - General Support - - - Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO - MXCSR := a[31:0] AND ~_MM_ROUND_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR -
xmmintrin.h
-
- - SSE - General Support - - Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF - dst[31:0] := MXCSR & _MM_FLUSH_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) +FI -
xmmintrin.h
-
- - SSE - General Support - - - Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF - MXCSR := a[31:0] AND ~_MM_FLUSH_MASK + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) +FI -
xmmintrin.h
-
- - SSE - General Support - - - - Fetch the line of data from memory that contains address "p" to a location in the cache heirarchy specified by the locality hint "i". - - - - -
xmmintrin.h
-
- - SSE - General Support - - - Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order. - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 3 i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". FOR j := 0 to 3 i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". FOR j := 0 to 3 i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] OR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] XOR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] OR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] XOR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 7 i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Arithmetic - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 3 i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Arithmetic - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Probability/Statistics - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". FOR j := 0 to 7 i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Probability/Statistics - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 7 i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Probability/Statistics - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 3 i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Probability/Statistics - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Arithmetic - Miscellaneous - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". FOR j := 0 to 7 i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 ENDFOR -dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] -dst[63:16] := 0 - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Arithmetic - Miscellaneous - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". -FOR j := 0 to 7 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 ENDFOR -dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] -dst[63:16] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Convert - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + + MMX +
mmintrin.h
+ Compare +
+ + + + Return vector of type __m64 with all elements set to zero. -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] +dst[MAX:0] := 0 - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + + MMX +
mmintrin.h
+ Set +
+ + + + + Set packed 32-bit integers in "dst" with the supplied values. -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 +dst[31:0] := e0 +dst[63:32] := e1 - -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - - Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". + MMX +
mmintrin.h
+ Set +
+ + + + + + + Set packed 16-bit integers in "dst" with the supplied values. -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 - -
xmmintrin.h
-
- - Integer - SSE - Convert - - - - Convert packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". + MMX +
mmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all all elements of "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] +dst[31:0] := e1 +dst[63:32] := e0 - -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - Convert packed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + MMX +
mmintrin.h
+ Set +
+ + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. -FOR j := 0 to 3 - i := j*16 - m := j*32 - dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -ENDFOR +dst[15:0] := e3 +dst[31:16] := e2 +dst[47:32] := e1 +dst[63:48] := e0 -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + MMX +
mmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. -FOR j := 0 to 3 - i := j*16 - m := j*32 - dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -ENDFOR +dst[7:0] := e7 +dst[15:8] := e6 +dst[23:16] := e5 +dst[31:24] := e4 +dst[39:32] := e3 +dst[47:40] := e2 +dst[55:48] := e1 +dst[63:56] := e0 -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - Convert the lower packed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + MMX +
mmintrin.h
+ Set +
+ + + + + + + + Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints". + + MONITOR +
pmmintrin.h
+ General Support +
+ + + + + Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR. + + MONITOR +
pmmintrin.h
+ General Support +
+ + + + + + Load 16 bits from memory, perform a byte swap operation, and store the result in "dst". -FOR j := 0 to 3 +FOR j := 0 to 1 i := j*8 - m := j*32 - dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) + dst[i+7:i] := MEM[ptr+15-i:ptr+8-i] ENDFOR -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + MOVBE +
immintrin.h
+ Load +
+ + + + Load 32 bits from memory, perform a byte swap operation, and store the result in "dst". FOR j := 0 to 3 i := j*8 - m := j*32 - dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) + dst[i+7:i] := MEM[ptr+31-i:ptr+24-i] ENDFOR -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then covert the packed signed 32-bit integers in "b" to single-precision (32-bit) floating-point element, and store the results in the upper 2 elements of "dst". + + MOVBE +
immintrin.h
+ Load +
+ + + + Load 64 bits from memory, perform a byte swap operation, and store the result in "dst". -dst[31:0] := Convert_Int32_To_FP32(a[31:0]) -dst[63:32] := Convert_Int32_To_FP32(a[63:32]) -dst[95:64] := Convert_Int32_To_FP32(b[31:0]) -dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MEM[ptr+63-i:ptr+56-i] +ENDFOR -
xmmintrin.h
-
- - Integer - SSE - Store + + MOVBE +
immintrin.h
+ Load +
+ - - - Store 64-bits of integer data from "a" into memory using a non-temporal memory hint. + + + Perform a bit swap operation of the 16 bits in "data", and store the results to memory. -MEM[mem_addr+63:mem_addr] := a[63:0] +FOR j := 0 to 1 + i := j*8 + MEM[ptr+i+7:ptr+i] := data[15-i:8-i] +ENDFOR - -
xmmintrin.h
-
- - Integer - SSE + + MOVBE +
immintrin.h
Store +
+ - - - - Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. + + + Perform a bit swap operation of the 32 bits in "data", and store the results to memory. -FOR j := 0 to 7 +addr := MEM[ptr] +FOR j := 0 to 3 i := j*8 - IF mask[i+7] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI + MEM[ptr+i+7:ptr+i] := data[31-i:24-i] ENDFOR - -
xmmintrin.h
-
- - Integer - SSE + + MOVBE +
immintrin.h
Store +
+ - - - - Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + + + Perform a bit swap operation of the 64 bits in "data", and store the results to memory. +addr := MEM[ptr] FOR j := 0 to 7 i := j*8 - IF mask[i+7] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI + MEM[ptr+i+7:ptr+i] := data[63-i:56-i] ENDFOR - -
xmmintrin.h
-
- - Integer - SSE - Swizzle - - - - Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + + MOVBE +
immintrin.h
+ Store +
+ + + + + + + Move 64-byte (512-bit) value using direct store from source memory address "src" to destination memory address "dst". -dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] -dst[31:16] := 0 +MEM[dst+511:dst] := MEM[src+511:src] - -
xmmintrin.h
-
- - Integer - SSE - Swizzle - - - - Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + + MOVDIR64B +
immintrin.h
+ Store +
+ + + + + + + Store 64-bit integer from "val" into memory using direct store. -dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] -dst[31:16] := 0 +MEM[dst+63:dst] := val[63:0] - -
xmmintrin.h
-
- - Integer - SSE - Swizzle - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + + MOVDIRI +
immintrin.h
+ Store +
+ + + + + Store 32-bit integer from "val" into memory using direct store. -dst[63:0] := a[63:0] -sel := imm8[1:0]*16 -dst[sel+15:sel] := i[15:0] +MEM[dst+31:dst] := val[31:0] - -
xmmintrin.h
-
- - Integer - SSE - Swizzle - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". - -dst[63:0] := a[63:0] -sel := imm8[1:0]*16 -dst[sel+15:sel] := i[15:0] + + MOVDIRI +
immintrin.h
+ Store +
+ + + + + + + Make a pointer with the value of "srcmem" and bounds set to ["srcmem", "srcmem" + "size" - 1], and store the result in "dst". + dst := srcmem +dst.LB := srcmem.LB +dst.UB := srcmem + size - 1 - -
xmmintrin.h
-
- - Integer - SSE + + MPX +
immintrin.h
Miscellaneous - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[j] := a[i+7] -ENDFOR -dst[MAX:8] := 0 +
+ + + + + + Narrow the bounds for pointer "q" to the intersection of the bounds of "r" and the bounds ["q", "q" + "size" - 1], and store the result in "dst". + dst := q +IF r.LB > (q + size - 1) OR r.UB < q + dst.LB := 1 + dst.UB := 0 +ELSE + dst.LB := MAX(r.LB, q) + dst.UB := MIN(r.UB, (q + size - 1)) +FI - -
xmmintrin.h
-
- - Integer - SSE + MPX +
immintrin.h
Miscellaneous - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[j] := a[i+7] -ENDFOR -dst[MAX:8] := 0 - - -
xmmintrin.h
-
- - Integer - SSE - Swizzle - - - - Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[15:0] := src[15:0] - 1: tmp[15:0] := src[31:16] - 2: tmp[15:0] := src[47:32] - 3: tmp[15:0] := src[63:48] - ESAC - RETURN tmp[15:0] -} -dst[15:0] := SELECT4(a[63:0], imm8[1:0]) -dst[31:16] := SELECT4(a[63:0], imm8[3:2]) -dst[47:32] := SELECT4(a[63:0], imm8[5:4]) -dst[63:48] := SELECT4(a[63:0], imm8[7:6]) - - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Swizzle - - - - Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[15:0] := src[15:0] - 1: tmp[15:0] := src[31:16] - 2: tmp[15:0] := src[47:32] - 3: tmp[15:0] := src[63:48] - ESAC - RETURN tmp[15:0] -} -dst[15:0] := SELECT4(a[63:0], imm8[1:0]) -dst[31:16] := SELECT4(a[63:0], imm8[3:2]) -dst[47:32] := SELECT4(a[63:0], imm8[5:4]) -dst[63:48] := SELECT4(a[63:0], imm8[7:6]) + + + + + + Make a pointer with the value of "q" and bounds set to the bounds of "r" (e.g. copy the bounds of "r" to pointer "q"), and store the result in "dst". + dst := q +dst.LB := r.LB +dst.UB := r.UB - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] + b[31:0] -dst[127:32] := a[127:32] + MPX +
immintrin.h
+ Miscellaneous +
+ + + + Make a pointer with the value of "q" and open bounds, which allow the pointer to access the entire virtual address space, and store the result in "dst". + dst := q +dst.LB := 0 +dst.UB := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR + MPX +
immintrin.h
+ Miscellaneous +
+ + + + + Stores the bounds of "ptr_val" pointer in memory at address "ptr_addr". + MEM[ptr_addr].LB := ptr_val.LB +MEM[ptr_addr].UB := ptr_val.UB - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] - b[31:0] -dst[127:32] := a[127:32] + + MPX +
immintrin.h
+ Miscellaneous +
+ + + + Checks if "q" is within its lower bound, and throws a #BR if not. + IF q < q.LB + #BR +FI - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR + + MPX +
immintrin.h
+ Miscellaneous +
+ + + + Checks if "q" is within its upper bound, and throws a #BR if not. + IF q > q.UB + #BR +FI - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] * b[31:0] -dst[127:32] := a[127:32] + + + MPX +
immintrin.h
+ Miscellaneous +
+ + + + + Checks if ["q", "q" + "size" - 1] is within the lower and upper bounds of "q" and throws a #BR if not. + IF (q + size - 1) < q.LB OR (q + size - 1) > q.UB + #BR +FI - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] * b[i+31:i] -ENDFOR + + + MPX +
immintrin.h
+ Miscellaneous +
+ + + + Return the lower bound of "q". + dst := q.LB - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] / b[31:0] -dst[127:32] := a[127:32] + MPX +
immintrin.h
+ Miscellaneous +
+ + + + Return the upper bound of "q". + dst := q.UB - -
xmmintrin.h
-
- - Floating Point - SSE - Arithmetic - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + MPX +
immintrin.h
+ Miscellaneous +
+ + + + + + Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := a[i+31:i] / b[i+31:i] -ENDFOR +tmp := 0 +IF a == 0 + // dst is undefined +ELSE + DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + OD +FI +dst := tmp - -
xmmintrin.h
-
- - Floating Point - SSE - Elementary Math Functions - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. -dst[31:0] := SQRT(a[31:0]) -dst[127:32] := a[127:32] +tmp := 31 +IF a == 0 + // dst is undefined +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD +FI +dst := tmp - -
xmmintrin.h
-
- - Floating Point - SSE - Elementary Math Functions - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR +tmp := 0 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 31) ? 0 : 1 +FI - -
xmmintrin.h
-
- - Floating Point - SSE - Elementary Math Functions - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. -dst[31:0] := (1.0 / a[31:0]) -dst[127:32] := a[127:32] +tmp := 31 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 0) ? 0 : 1 +FI - -
xmmintrin.h
-
- - Floating Point - SSE - Elementary Math Functions - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR +tmp := 0 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 63) ? 0 : 1 +FI - -
xmmintrin.h
-
- - Floating Point - SSE - Elementary Math Functions - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. -dst[31:0] := (1.0 / SQRT(a[31:0])) -dst[127:32] := a[127:32] +tmp := 63 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 0) ? 0 : 1 +FI - -
xmmintrin.h
-
- - Floating Point - SSE - Elementary Math Functions - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) -ENDFOR +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] - -
xmmintrin.h
-
- - Floating Point - SSE - Special Math Functions - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement. -dst[31:0] := MIN(a[31:0], b[31:0]) -dst[127:32] := a[127:32] +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := ~dst[0] - -
xmmintrin.h
-
- - Floating Point - SSE - Special Math Functions - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to zero. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Special Math Functions - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to one. -dst[31:0] := MAX(a[31:0], b[31:0]) -dst[127:32] := a[127:32] +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := 1 - -
xmmintrin.h
-
- - Floating Point - SSE - Special Math Functions - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR +addr := a + b +dst[0] := MEM[addr] - -
xmmintrin.h
-
- - Floating Point - SSE - Logical - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) -ENDFOR +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := ~dst[0] - -
xmmintrin.h
-
- - Floating Point - SSE - Logical - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to zero. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) -ENDFOR +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Logical - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to one. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := 1 - -
xmmintrin.h
-
- - Floating Point - SSE - Logical - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] -ENDFOR +dst[7:0] := a[31:24] +dst[15:8] := a[23:16] +dst[23:16] := a[15:8] +dst[31:24] := a[7:0] - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +
immintrin.h
+ Bit Manipulation +
+ + + + Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. -dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +dst[7:0] := a[63:56] +dst[15:8] := a[55:48] +dst[23:16] := a[47:40] +dst[31:24] := a[39:32] +dst[39:32] := a[31:24] +dst[47:40] := a[23:16] +dst[55:48] := a[15:8] +dst[63:56] := a[7:0] - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR + +
immintrin.h
+ Bit Manipulation +
+ + + + Cast from type float to type unsigned __int32 without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type double to type unsigned __int64 without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type unsigned __int32 to type float without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type unsigned __int64 to type double without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + + Shift the bits of unsigned long integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + // size := 32 or 64 +dst := a +count := shift AND (size - 1) +DO WHILE (count > 0) + tmp[0] := dst[size - 1] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned long integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + // size := 32 or 64 +dst := a +count := shift AND (size - 1) +DO WHILE (count > 0) + tmp[size - 1] := dst[0] + dst := (dst >> 1) OR tmp[size - 1] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 32-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". -dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +dst := a +count := shift AND 31 +DO WHILE (count > 0) + tmp[0] := dst[31] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 32-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +dst := a +count := shift AND 31 +DO WHILE (count > 0) + tmp[31] := dst[0] + dst := (dst >> 1) OR tmp + count := count - 1 +OD - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 16-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". -dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +dst := a +count := shift AND 15 +DO WHILE (count > 0) + tmp[0] := dst[15] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 16-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +dst := a +count := shift AND 15 +DO WHILE (count > 0) + tmp[15] := dst[0] + dst := (dst >> 1) OR tmp + count := count - 1 +OD - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". -dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +dst := a +count := shift AND 63 +DO WHILE (count > 0) + tmp[0] := dst[63] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +dst := a +count := shift AND 63 +DO WHILE (count > 0) + tmp[63] := dst[0] + dst := (dst >> 1) OR tmp[63] + count := count - 1 +OD - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +
immintrin.h
+ Shift +
+ + + + Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below: -dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +_FEATURE_GENERIC_IA32 +_FEATURE_FPU +_FEATURE_CMOV +_FEATURE_MMX +_FEATURE_FXSAVE +_FEATURE_SSE +_FEATURE_SSE2 +_FEATURE_SSE3 +_FEATURE_SSSE3 +_FEATURE_SSE4_1 +_FEATURE_SSE4_2 +_FEATURE_MOVBE +_FEATURE_POPCNT +_FEATURE_PCLMULQDQ +_FEATURE_AES +_FEATURE_F16C +_FEATURE_AVX +_FEATURE_RDRND +_FEATURE_FMA +_FEATURE_BMI +_FEATURE_LZCNT +_FEATURE_HLE +_FEATURE_RTM +_FEATURE_AVX2 +_FEATURE_KNCNI +_FEATURE_AVX512F +_FEATURE_ADX +_FEATURE_RDSEED +_FEATURE_AVX512ER +_FEATURE_AVX512PF +_FEATURE_AVX512CD +_FEATURE_SHA +_FEATURE_MPX +_FEATURE_AVX512BW +_FEATURE_AVX512VL +_FEATURE_AVX512VBMI +_FEATURE_AVX512_4FMAPS +_FEATURE_AVX512_4VNNIW +_FEATURE_AVX512_VPOPCNTDQ +_FEATURE_AVX512_BITALG +_FEATURE_AVX512_VBMI2 +_FEATURE_GFNI +_FEATURE_VAES +_FEATURE_VPCLMULQDQ +_FEATURE_AVX512_VNNI +_FEATURE_CLWB +_FEATURE_RDPID +_FEATURE_IBT +_FEATURE_SHSTK +_FEATURE_SGX +_FEATURE_WBNOINVD +_FEATURE_PCONFIG +_FEATURE_AXV512_4VNNIB +_FEATURE_AXV512_4FMAPH +_FEATURE_AXV512_BITALG2 +_FEATURE_AXV512_VP2INTERSECT - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". +
immintrin.h
+ General Support +
+ + + + Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This intrinsic does not check the processor vendor. See the valid feature flags below: -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +_FEATURE_GENERIC_IA32 +_FEATURE_FPU +_FEATURE_CMOV +_FEATURE_MMX +_FEATURE_FXSAVE +_FEATURE_SSE +_FEATURE_SSE2 +_FEATURE_SSE3 +_FEATURE_SSSE3 +_FEATURE_SSE4_1 +_FEATURE_SSE4_2 +_FEATURE_MOVBE +_FEATURE_POPCNT +_FEATURE_PCLMULQDQ +_FEATURE_AES +_FEATURE_F16C +_FEATURE_AVX +_FEATURE_RDRND +_FEATURE_FMA +_FEATURE_BMI +_FEATURE_LZCNT +_FEATURE_HLE +_FEATURE_RTM +_FEATURE_AVX2 +_FEATURE_KNCNI +_FEATURE_AVX512F +_FEATURE_ADX +_FEATURE_RDSEED +_FEATURE_AVX512ER +_FEATURE_AVX512PF +_FEATURE_AVX512CD +_FEATURE_SHA +_FEATURE_MPX +_FEATURE_AVX512BW +_FEATURE_AVX512VL +_FEATURE_AVX512VBMI +_FEATURE_AVX512_4FMAPS +_FEATURE_AVX512_4VNNIW +_FEATURE_AVX512_VPOPCNTDQ +_FEATURE_AVX512_BITALG +_FEATURE_AVX512_VBMI2 +_FEATURE_GFNI +_FEATURE_VAES +_FEATURE_VPCLMULQDQ +_FEATURE_AVX512_VNNI +_FEATURE_CLWB +_FEATURE_RDPID +_FEATURE_IBT +_FEATURE_SHSTK +_FEATURE_SGX +_FEATURE_WBNOINVD +_FEATURE_PCONFIG +_FEATURE_AXV512_4VNNIB +_FEATURE_AXV512_4FMAPH +_FEATURE_AXV512_BITALG2 +_FEATURE_AXV512_VP2INTERSECT - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ General Support +
+ + + + Read the Performance Monitor Counter (PMC) specified by "a", and store up to 64-bits in "dst". The width of performance counters is implementation specific. + dst[63:0] := ReadPMC(a) + + +
immintrin.h
+ General Support +
+ + + + + + + Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). -dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 1 : 0) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 32-bit integer "b", and subtract the result from unsigned 32-bit integer "a". Store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). -dst[31:0] := (!( a[31:0] < b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +tmp[32:0] := a[31:0] - (b[31:0] + (c_in > 0 ? 1 : 0)) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 64-bit integer "b", and subtract the result from unsigned 64-bit integer "a". Store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +tmp[64:0] := a[63:0] - (b[63:0] + (c_in > 0 ? 1 : 0)) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +
immintrin.h
+ Arithmetic +
+ + + + Insert the 32-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. + +
immintrin.h
+ Miscellaneous +
+ + + + Insert the 64-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave user (non-privilege) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave system (privileged) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave virtualized (VMM) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + Write back and flush internal caches. + Initiate writing-back and flushing of external + caches. + +
immintrin.h
+ Miscellaneous +
+ + + + Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst". -dst[31:0] := (!( a[31:0] <= b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +dst[31:0] := Convert_FP16_To_FP32(a[15:0]) - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". +
emmintrin.h
+ Convert +
+ + + + + Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst". + [round_note] -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (!( a[i+31:i] <= b[i+31:i] )) ? 0xFFFFFFFF : 0 -ENDFOR +dst[15:0] := Convert_FP32_To_FP16(a[31:0]) - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
emmintrin.h
+ Convert +
+ + + + + + + + Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst". -dst[31:0] := (!( a[31:0] > b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +IF (imm8[0] == 0) + TEMP1 := a[63:0] +ELSE + TEMP1 := a[127:64] +FI +IF (imm8[4] == 0) + TEMP2 := b[63:0] +ELSE + TEMP2 := b[127:64] +FI +FOR i := 0 to 63 + TEMP[i] := (TEMP1[0] and TEMP2[i]) + FOR j := 1 to i + TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) + ENDFOR + dst[i] := TEMP[i] +ENDFOR +FOR i := 64 to 127 + TEMP[i] := 0 + FOR j := (i - 63) to 63 + TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) + ENDFOR + dst[i] := TEMP[i] +ENDFOR +dst[127] := 0 - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". + + PCLMULQDQ +
wmmintrin.h
+ Application-Targeted +
+ + + + + + + Invoke the PCONFIG leaf function specified by "a". The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to rbx, rcx, and rdx. May return the value in eax, depending on the semantics of the specified leaf function. + + PCONFIG +
immintrin.h
+ Miscellaneous +
+ + + + + + Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (!( a[i+31:i] > b[i+31:i] )) ? 0xFFFFFFFF : 0 +dst := 0 +FOR i := 0 to 31 + IF a[i] + dst := dst + 1 + FI ENDFOR - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". -dst[31:0] := (!( a[31:0] >= b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] +dst := 0 +FOR i := 0 to 63 + IF a[i] + dst := dst + 1 + FI +ENDFOR - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in 32-bit integer "a", and return that count in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (!( a[i+31:i] >= b[i+31:i] )) ? 0xFFFFFFFF : 0 +dst := 0 +FOR i := 0 to 31 + IF a[i] + dst := dst + 1 + FI ENDFOR - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xFFFFFFFF : 0 + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in 64-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 63 + IF a[i] + dst := dst + 1 + FI ENDFOR - + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + + + + Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i". + + PREFETCHWT1
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] + General Support + + + + + + + Copy the IA32_TSC_AUX MSR (signature value) into "dst". + dst[31:0] := IA32_TSC_AUX[31:0] - -
xmmintrin.h
-
- - Floating Point - SSE - Compare - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xFFFFFFFF : 0 -ENDFOR + + RDPID +
immintrin.h
+ General Support +
+ + + + + + Read a hardware generated 16-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[15:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[15:0] := 0 + dst := 0 +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). - -RETURN ( a[31:0] == b[31:0] ) ? 1 : 0 + + RDRAND +
immintrin.h
+ Random +
+ + + + Read a hardware generated 32-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[31:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[31:0] := 0 + dst := 0 +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). - -RETURN ( a[31:0] < b[31:0] ) ? 1 : 0 + + RDRAND +
immintrin.h
+ Random +
+ + + + Read a hardware generated 64-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[63:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[63:0] := 0 + dst := 0 +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). - -RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0 + + RDRAND +
immintrin.h
+ Random +
+ + + + + + Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[15:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[15:0] := 0 + dst := 0 +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). - -RETURN ( a[31:0] > b[31:0] ) ? 1 : 0 + + RDSEED +
immintrin.h
+ Random +
+ + + + Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[31:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[31:0] := 0 + dst := 0 +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). - -RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0 + + RDSEED +
immintrin.h
+ Random +
+ + + + Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[63:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[63:0] := 0 + dst := 0 +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). - -RETURN ( a[31:0] != b[31:0] ) ? 1 : 0 + + RDSEED +
immintrin.h
+ Random +
+ + + + + + Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr". + dst[63:0] := TimeStampCounter +MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0] - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[31:0] == b[31:0] ) ? 1 : 0 + + RDTSCP +
immintrin.h
+ General Support +
+ + + + + + Force an RTM abort. The EAX register is updated to reflect an XABORT instruction caused the abort, and the "imm8" parameter will be provided in bits [31:24] of EAX. + Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction. + IF RTM_ACTIVE == 0 + // nop +ELSE + // restore architectural register state + // discard memory updates performed in transaction + // update EAX with status and imm8 value + eax[31:24] := imm8[7:0] + RTM_NEST_COUNT := 0 + RTM_ACTIVE := 0 + IF _64_BIT_MODE + RIP := fallbackRIP + ELSE + EIP := fallbackEIP + FI +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[31:0] < b[31:0] ) ? 1 : 0 + + RTM +
immintrin.h
+ General Support +
+ + + + Specify the start of an RTM code region. + If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution. + On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction. Return status of ~0 (0xFFFFFFFF) if continuing inside transaction; all other codes are aborts. + IF RTM_NEST_COUNT < MAX_RTM_NEST_COUNT + RTM_NEST_COUNT := RTM_NEST_COUNT + 1 + IF RTM_NEST_COUNT == 1 + IF _64_BIT_MODE + fallbackRIP := RIP + ELSE IF _32_BIT_MODE + fallbackEIP := EIP + FI + + RTM_ACTIVE := 1 + // enter RTM execution, record register state, start tracking memory state + FI +ELSE + // RTM abort (see _xabort) +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0 + + RTM +
immintrin.h
+ General Support +
+ + + + Specify the end of an RTM code region. + If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically. + If the commit fails, the logical processor will perform an RTM abort. + IF RTM_ACTIVE == 1 + RTM_NEST_COUNT := RTM_NEST_COUNT - 1 + IF RTM_NEST_COUNT == 0 + // try to commit transaction + IF FAIL_TO_COMMIT_TRANSACTION + // RTM abort (see _xabort) + ELSE + RTM_ACTIVE := 0 + FI + FI +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[31:0] > b[31:0] ) ? 1 : 0 + + RTM +
immintrin.h
+ General Support +
+ + + + Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise. + IF (RTM_ACTIVE == 1 OR HLE_ACTIVE == 1) + dst := 1 +ELSE + dst := 0 +FI - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + + RTM +
immintrin.h
+ General Support +
+ + + + + Serialize instruction execution, ensuring all modifications to flags, registers, and memory by previous instructions are completed before the next instruction is fetched. + + SERIALIZE +
immintrin.h
+ General Support +
+ + + + + + + Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". -RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0 +W0 := a[127:96] +W1 := a[95:64] +W2 := a[63:32] +W3 := a[31:0] +W4 := b[127:96] +W5 := b[95:64] +dst[127:96] := W2 XOR W0 +dst[95:64] := W3 XOR W1 +dst[63:32] := W4 XOR W2 +dst[31:0] := W5 XOR W3 - -
xmmintrin.h
-
- - Floating Point - Flag - SSE - Compare - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in "a" and the previous message values in "b", and store the result in "dst". -RETURN ( a[31:0] != b[31:0] ) ? 1 : 0 +W13 := b[95:64] +W14 := b[63:32] +W15 := b[31:0] +W16 := (a[127:96] XOR W13) <<< 1 +W17 := (a[95:64] XOR W14) <<< 1 +W18 := (a[63:32] XOR W15) <<< 1 +W19 := (a[31:0] XOR W16) <<< 1 +dst[127:96] := W16 +dst[95:64] := W17 +dst[63:32] := W18 +dst[31:0] := W19 - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable "a", add that value to the scheduled values (unsigned 32-bit integers) in "b", and store the result in "dst". -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +tmp := (a[127:96] <<< 30) +dst[127:96] := b[127:96] + tmp +dst[95:64] := b[95:64] +dst[63:32] := b[63:32] +dst[31:0] := b[31:0] - -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + SHA +
immintrin.h
+ Cryptography +
+ + + + + + Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from "a" and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from "b", and store the updated SHA1 state (A,B,C,D) in "dst". "func" contains the logic functions and round constants. + IF (func[1:0] == 0) + f := f0() + K := K0 +ELSE IF (func[1:0] == 1) + f := f1() + K := K1 +ELSE IF (func[1:0] == 2) + f := f2() + K := K2 +ELSE IF (func[1:0] == 3) + f := f3() + K := K3 +FI +A := a[127:96] +B := a[95:64] +C := a[63:32] +D := a[31:0] +W[0] := b[127:96] +W[1] := b[95:64] +W[2] := b[63:32] +W[3] := b[31:0] +A[1] := f(B, C, D) + (A <<< 5) + W[0] + K +B[1] := A +C[1] := B <<< 30 +D[1] := C +E[1] := D +FOR i := 1 to 3 + A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K + B[i+1] := A[i] + C[i+1] := B[i] <<< 30 + D[i+1] := C[i] + E[i+1] := D[i] +ENDFOR +dst[127:96] := A[4] +dst[95:64] := B[4] +dst[63:32] := C[4] +dst[31:0] := D[4] - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + W4 := b[31:0] +W3 := a[127:96] +W2 := a[95:64] +W1 := a[63:32] +W0 := a[31:0] +dst[127:96] := W3 + sigma0(W4) +dst[95:64] := W2 + sigma0(W3) +dst[63:32] := W1 + sigma0(W2) +dst[31:0] := W0 + sigma0(W1) - -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". - -dst[31:0] := a[31:0] + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst"." + W14 := b[95:64] +W15 := b[127:96] +W16 := a[31:0] + sigma1(W14) +W17 := a[63:32] + sigma1(W15) +W18 := a[95:64] + sigma1(W16) +W19 := a[127:96] + sigma1(W17) +dst[127:96] := W19 +dst[95:64] := W18 +dst[63:32] := W17 +dst[31:0] := W16 - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + + SHA +
immintrin.h
+ Cryptography +
+ + + + + + Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from "a", an initial SHA256 state (A,B,E,F) from "b", and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from "k", and store the updated SHA256 state (A,B,E,F) in "dst". + A[0] := b[127:96] +B[0] := b[95:64] +C[0] := a[127:96] +D[0] := a[95:64] +E[0] := b[63:32] +F[0] := b[31:0] +G[0] := a[63:32] +H[0] := a[31:0] +W_K[0] := k[31:0] +W_K[1] := k[63:32] +FOR i := 0 to 1 + A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]) + B[i+1] := A[i] + C[i+1] := B[i] + D[i+1] := C[i] + E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i] + F[i+1] := E[i] + G[i+1] := F[i] + H[i+1] := G[i] ENDFOR +dst[127:96] := A[2] +dst[95:64] := B[2] +dst[63:32] := E[2] +dst[31:0] := F[2] - -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + + SHA +
immintrin.h
+ Cryptography +
+ + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - -
xmmintrin.h
-
- - Floating Point - SSE - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - -
xmmintrin.h
-
- - Floating Point - Integer - SSE - Convert - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - - -
xmmintrin.h
-
- - Floating Point - Integer SSE - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + i := j*64 + dst[i+63:i] := ACOSH(a[i+63:i]) ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point - Integer SSE - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 - i := 16*j - k := 32*j - IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) - dst[i+15:i] := 0x7FFF - ELSE - dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) - FI + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) ENDFOR +dst[MAX:128] := 0 -
xmmintrin.h
-
- - Floating Point - Integer SSE - Convert - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in lower 4 elements of "dst". Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 3 - i := 8*j - k := 32*j - IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) - dst[i+7:i] := 0x7F - ELSE - dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) - FI +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) ENDFOR +dst[MAX:128] := 0 -
xmmintrin.h
-
- - Floating Point - SSE - Set - - - Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements. - -dst[31:0] := a[31:0] -dst[127:32] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Set - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := a[31:0] + dst[i+31:i] := ASIN(a[i+31:i]) ENDFOR +dst[MAX:128] := 0 -
xmmintrin.h
-
- - Floating Point SSE - Set - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[31:0] +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) ENDFOR +dst[MAX:128] := 0 -
xmmintrin.h
-
- - Floating Point - SSE - Set - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 - -
xmmintrin.h
-
- - Floating Point - SSE - Set - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[31:0] := e3 -dst[63:32] := e2 -dst[95:64] := e1 -dst[127:96] := e0 - -
xmmintrin.h
-
- - Floating Point - SSE - Set - - - Return vector of type __m128 with all elements set to zero. - -dst[MAX:0] := 0 - - -
xmmintrin.h
-
- - Integer - SSE - Load - - - - Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - -dst[31:0] := a[31:0] -dst[63:32] := a[63:32] -dst[95:64] := MEM[mem_addr+31:mem_addr] -dst[127:96] := MEM[mem_addr+63:mem_addr+32] - - -
xmmintrin.h
-
- - Integer - SSE - Load - - - - Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[63:32] := MEM[mem_addr+63:mem_addr+32] -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] - - -
xmmintrin.h
-
- - Floating Point - SSE - Load - - - Load a single-precision (32-bit) floating-point element from memory into the lower of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary. - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[127:32] := 0 - - -
xmmintrin.h
-
- - Floating Point - SSE - Load - - - Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[63:32] := MEM[mem_addr+31:mem_addr] -dst[95:64] := MEM[mem_addr+31:mem_addr] -dst[127:96] := MEM[mem_addr+31:mem_addr] - -
xmmintrin.h
-
- - Floating Point - SSE - Load - - - Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[63:32] := MEM[mem_addr+31:mem_addr] -dst[95:64] := MEM[mem_addr+31:mem_addr] -dst[127:96] := MEM[mem_addr+31:mem_addr] - -
xmmintrin.h
-
- - Floating Point - SSE - Load - - - Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - -
xmmintrin.h
-
- - Floating Point - SSE - Load - - - Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - -
xmmintrin.h
-
- - Floating Point - SSE - Load - - - Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[31:0] := MEM[mem_addr+127:mem_addr+96] -dst[63:32] := MEM[mem_addr+95:mem_addr+64] -dst[95:64] := MEM[mem_addr+63:mem_addr+32] -dst[127:96] := MEM[mem_addr+31:mem_addr] - -
xmmintrin.h
-
- - Floating Point - SSE - Store - - - - Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - -
xmmintrin.h
-
- - Integer - SSE - Store - - - - Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory. - -MEM[mem_addr+31:mem_addr] := a[95:64] -MEM[mem_addr+63:mem_addr+32] := a[127:96] - - -
xmmintrin.h
-
- - Integer - SSE - Store - - - - Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory. - -MEM[mem_addr+31:mem_addr] := a[31:0] -MEM[mem_addr+63:mem_addr+32] := a[63:32] - - -
xmmintrin.h
-
- - Floating Point - SSE - Store - - - - Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+31:mem_addr] := a[31:0] - - -
xmmintrin.h
-
- - Floating Point - SSE - Store - - - - Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+31:mem_addr] := a[31:0] -MEM[mem_addr+63:mem_addr+32] := a[31:0] -MEM[mem_addr+95:mem_addr+64] := a[31:0] -MEM[mem_addr+127:mem_addr+96] := a[31:0] - -
xmmintrin.h
-
- - Floating Point - SSE - Store - - - - Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+31:mem_addr] := a[31:0] -MEM[mem_addr+63:mem_addr+32] := a[31:0] -MEM[mem_addr+95:mem_addr+64] := a[31:0] -MEM[mem_addr+127:mem_addr+96] := a[31:0] - -
xmmintrin.h
-
- - Floating Point - SSE - Store - - - - Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - -
xmmintrin.h
-
- - Floating Point SSE - Store - - - - Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -MEM[mem_addr+127:mem_addr] := a[127:0] +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Store - - - - Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -MEM[mem_addr+31:mem_addr] := a[127:96] -MEM[mem_addr+63:mem_addr+32] := a[95:64] -MEM[mem_addr+95:mem_addr+64] := a[63:32] -MEM[mem_addr+127:mem_addr+96] := a[31:0] +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Move - - - - Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -dst[31:0] := b[31:0] -dst[127:32] := a[127:32] +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Swizzle - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Swizzle - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Swizzle - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Move - - - - Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -dst[31:0] := b[95:64] -dst[63:32] := b[127:96] -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Move - - - - Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". -dst[31:0] := a[31:0] -dst[63:32] := a[63:32] -dst[95:64] := b[31:0] -dst[127:96] := b[63:32] +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - Floating Point SSE - Miscellaneous - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*32 - IF a[i+31] - dst[j] := 1 - ELSE - dst[j] := 0 - FI + dst[i+31:i] := COS(a[i+31:i]) ENDFOR -dst[MAX:4] := 0 +dst[MAX:128] := 0 - -
xmmintrin.h
-
- - SSE - General Support - - - - Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc". -
xmmintrin.h
-
- SSE - General Support - - - Free aligned memory that was allocated with "_mm_malloc". -
xmmintrin.h
-
- - Floating Point +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - General Support - - - Return vector of type __m128 with undefined elements.
immintrin.h
-
- - Floating Point + Trigonometry + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE +
immintrin.h
Trigonometry - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ACOS(a[i+63:i]) + dst[i+63:i] := COSH(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ACOS(a[i+31:i]) + dst[i+31:i] := COSH(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ACOSH(a[i+63:i]) + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + + Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ACOSH(a[i+31:i]) + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ASIN(a[i+63:i]) + dst[i+63:i] := SIN(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ASIN(a[i+31:i]) + dst[i+31:i] := SIN(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ASINH(a[i+63:i]) + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ASINH(a[i+31:i]) + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ATAN(a[i+63:i]) + dst[i+63:i] := SIND(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ATAN(a[i+31:i]) + dst[i+31:i] := SIND(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) + dst[i+63:i] := SINH(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) + dst[i+31:i] := SINH(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := ATANH(a[i+63:i]) + dst[i+63:i] := TAN(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Trigonometry - - - Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ATANH(a[i+31:i]) + dst[i+31:i] := TAN(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := CubeRoot(a[i+63:i]) + dst[i+63:i] := TAND(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := CubeRoot(a[i+31:i]) + dst[i+31:i] := TAND(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Probability/Statistics - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 1 +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 i := j*64 - dst[i+63:i] := CDFNormal(a[i+63:i]) + dst[i+63:i] := TANH(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Probability/Statistics - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 3 +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 i := j*32 - dst[i+31:i] := CDFNormal(a[i+31:i]) + dst[i+31:i] := TANH(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Probability/Statistics - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) + dst[i+63:i] := CubeRoot(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Probability/Statistics - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) + dst[i+31:i] := CubeRoot(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Elementary Math Functions - - +
+ + + Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". DEFINE CEXP(a[31:0], b[31:0]) { @@ -138732,14 +158260,13 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Elementary Math Functions - - +
+ + + Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". DEFINE CLOG(a[31:0], b[31:0]) { @@ -138753,131 +158280,575 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the square root of packed complex snumbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CSQRT(a[31:0], b[31:0]) { + sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) + result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + RETURN result +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:128] := 0 + SSE - Trigonometry - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := COS(a[i+63:i]) + dst[i+63:i] := POW(e, a[i+63:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Trigonometry - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := COS(a[i+31:i]) + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Trigonometry - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := COSD(a[i+63:i]) + dst[i+63:i] := InvCubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InvCubeRoot(a[i+31:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Trigonometry - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := COSD(a[i+31:i]) + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Trigonometry - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := COSH(a[i+63:i]) + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +ENDFOR +dst[MAX:128] := 0 + SSE - Trigonometry - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := COSH(a[i+31:i]) + dst[i+31:i] := LOG(1.0 + a[i+31:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 + SSE +
immintrin.h
Elementary Math Functions - - - Compute the square root of packed complex snumbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". -DEFINE CSQRT(a[31:0], b[31:0]) { - sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) - result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - RETURN result -} FOR j := 0 to 1 i := j*64 - dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+63:i] := 1.0 - ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Integer + Probability/Statistics + + + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+63:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Arithmetic - - - +
immintrin.h
+ Probability/Statistics +
+ + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 15 @@ -138889,15 +158860,14 @@ FOR j := 0 to 15 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 7 @@ -138909,15 +158879,14 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 3 @@ -138929,15 +158898,14 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 1 @@ -138949,15 +158917,14 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 15 @@ -138969,15 +158936,14 @@ FOR j := 0 to 15 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 7 @@ -138989,15 +158955,14 @@ FOR j := 0 to 7 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 3 @@ -139009,15 +158974,14 @@ FOR j := 0 to 3 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE +
immintrin.h
Arithmetic - - - +
+ + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". FOR j := 0 to 1 @@ -139029,14 +158993,13 @@ FOR j := 0 to 1 ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE +
immintrin.h
Arithmetic - - +
+ + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 1 i := j*64 @@ -139044,2952 +159007,2870 @@ dst[MAX:128] := 0 ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Probability/Statistics - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ERF(a[i+31:i]) + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Probability/Statistics - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 15 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Probability/Statistics - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". FOR j := 0 to 3 - i := j*32 - dst[i+63:i] := 1.0 - ERF(a[i+31:i]) + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Probability/Statistics - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 15 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Probability/Statistics - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Probability/Statistics - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Probability/Statistics - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". FOR j := 0 to 3 - i := j*32 - dst[i+63:i] := 1.0 / ERF(a[i+31:i]) + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Arithmetic + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + SSE - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Arithmetic +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. FOR j := 0 to 1 i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) + dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. FOR j := 0 to 3 i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) + dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. FOR j := 0 to 1 i := j*64 - dst[i+63:i] := POW(10.0, a[i+63:i]) + dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. FOR j := 0 to 3 i := j*32 - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) + dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. FOR j := 0 to 1 i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) + dst[i+63:i] := ROUND(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. FOR j := 0 to 3 i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + dst[i+31:i] := ROUND(a[i+31:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 1 +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + FOR j := 0 to 1 i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 + dst[i+63:i] := TRUNCATE(a[i+63:i]) ENDFOR dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Elementary Math Functions - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 3 +
immintrin.h
+ Miscellaneous +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + FOR j := 0 to 3 i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 + dst[i+31:i] := TRUNCATE(a[i+31:i]) ENDFOR dst[MAX:128] := 0 + SSE
immintrin.h
-
- - Floating Point + Miscellaneous + + + + + + + + + + Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.). + +__m128 tmp3, tmp2, tmp1, tmp0; +tmp0 := _mm_unpacklo_ps(row0, row1); +tmp2 := _mm_unpacklo_ps(row2, row3); +tmp1 := _mm_unpackhi_ps(row0, row1); +tmp3 := _mm_unpackhi_ps(row2, row3); +row0 := _mm_movelh_ps(tmp0, tmp2); +row1 := _mm_movehl_ps(tmp2, tmp0); +row2 := _mm_movelh_ps(tmp1, tmp3); +row3 := _mm_movehl_ps(tmp3, tmp1); + SSE - Trigonometry - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
xmmintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) -ENDFOR -dst[MAX:128] := 0 +dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] +dst[31:16] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". +
xmmintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) -ENDFOR -dst[MAX:128] := 0 +dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] +dst[31:16] := 0 + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[63:0] := a[63:0] +sel := imm8[1:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[63:0] := a[63:0] +sel := imm8[1:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[15:0] := src[15:0] + 1: tmp[15:0] := src[31:16] + 2: tmp[15:0] := src[47:32] + 3: tmp[15:0] := src[63:48] + ESAC + RETURN tmp[15:0] +} +dst[15:0] := SELECT4(a[63:0], imm8[1:0]) +dst[31:16] := SELECT4(a[63:0], imm8[3:2]) +dst[47:32] := SELECT4(a[63:0], imm8[5:4]) +dst[63:48] := SELECT4(a[63:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[15:0] := src[15:0] + 1: tmp[15:0] := src[31:16] + 2: tmp[15:0] := src[47:32] + 3: tmp[15:0] := src[63:48] + ESAC + RETURN tmp[15:0] +} +dst[15:0] := SELECT4(a[63:0], imm8[1:0]) +dst[31:16] := SELECT4(a[63:0], imm8[3:2]) +dst[47:32] := SELECT4(a[63:0], imm8[5:4]) +dst[63:48] := SELECT4(a[63:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + Get the unsigned 32-bit value of the MXCSR control and status register. + dst[31:0] := MXCSR + + SSE
immintrin.h
-
- - Integer + General Support + + + + + Set the MXCSR control and status register with the value in unsigned 32-bit integer "a". + +MXCSR := a[31:0] + + SSE - Arithmetic - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +
immintrin.h
+ General Support +
+ + + Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT + dst[31:0] := MXCSR & _MM_EXCEPT_MASK + SSE
immintrin.h
-
- - Integer + General Support + + + + + Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT + MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK + SSE - Arithmetic - - - - - Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +
immintrin.h
+ General Support +
+ + + Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT + dst[31:0] := MXCSR & _MM_MASK_MASK + SSE
immintrin.h
-
- - Floating Point + General Support + + + + + Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT + MXCSR := a[31:0] AND ~_MM_MASK_MASK + SSE - Elementary Math Functions - - - Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := InvCubeRoot(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +
immintrin.h
+ General Support +
+ + + Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + dst[31:0] := MXCSR & _MM_ROUND_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + MXCSR := a[31:0] AND ~_MM_ROUND_MASK + SSE
immintrin.h
-
- - Floating Point + General Support + + + + Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF + dst[31:0] := MXCSR & _MM_FLUSH_MASK + SSE - Elementary Math Functions - - - Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := InvCubeRoot(a[i+31:i]) +
immintrin.h
+ General Support +
+ + + + Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF + MXCSR := a[31:0] AND ~_MM_FLUSH_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + + Fetch the line of data from memory that contains address "p" to a location in the cache heirarchy specified by the locality hint "i". + + + + + SSE +
immintrin.h
+ General Support +
+ + + + Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order. + + SSE +
immintrin.h
+ General Support +
+ + + + + Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc". + SSE +
immintrin.h
+ General Support +
+ + + + Free aligned memory that was allocated with "_mm_malloc". + SSE +
immintrin.h
+ General Support +
+ + + + Return vector of type __m128 with undefined elements. + SSE +
immintrin.h
+ General Support +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := InvSQRT(a[i+63:i]) +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := InvSQRT(a[i+31:i]) +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + SSE - Arithmetic - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". [min_float_note] + +dst[31:0] := MIN(a[31:0], b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] FOR j := 0 to 3 i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". [max_float_note] -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(1.0 + a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := MAX(a[31:0], b[31:0]) +dst[127:32] := a[127:32] -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] FOR j := 0 to 3 i := j*32 - dst[i+31:i] := LOG(1.0 + a[i+31:i]) + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Special Math Functions +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". +
xmmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +
xmmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of "dst". + +FOR j := 0 to 7 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR -dst[MAX:128] := 0 +dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] +dst[63:16] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +
xmmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of "dst". + +FOR j := 0 to 7 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR -dst[MAX:128] := 0 +dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] +dst[63:16] := 0 -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". +
xmmintrin.h
+ Arithmetic +
+ + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := a[31:0] + b[31:0] +dst[127:32] := a[127:32] -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". +
xmmintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) + dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + SSE +
xmmintrin.h
Arithmetic - - - - Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 15 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:128] := 0 +
+ + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] - b[31:0] +dst[127:32] := a[127:32] -
immintrin.h
-
- - Integer + SSE +
xmmintrin.h
Arithmetic - - - - Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 7 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + SSE +
xmmintrin.h
Arithmetic - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +
+ + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] * b[31:0] +dst[127:32] := a[127:32] -
immintrin.h
-
- - Integer + SSE +
xmmintrin.h
Arithmetic - - - - Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 1 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + SSE +
xmmintrin.h
Arithmetic - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 15 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:128] := 0 +
+ + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] / b[31:0] +dst[127:32] := a[127:32] -
immintrin.h
-
- - Integer + SSE +
xmmintrin.h
Arithmetic - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 7 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + SSE +
xmmintrin.h
Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + SSE - Arithmetic - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 1 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". +
xmmintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SIND(a[i+63:i]) -ENDFOR +
xmmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SIND(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +
xmmintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SINH(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SINH(a[i+31:i]) + i := j*16 + m := j*32 + dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. +
xmmintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) +FOR j := 0 to 3 + i := j*16 + m := j*32 + dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. +
xmmintrin.h
+ Convert +
+ + + + Convert the lower packed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) + i := j*8 + m := j*32 + dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. +
xmmintrin.h
+ Convert +
+ + + + Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) +FOR j := 0 to 3 + i := j*8 + m := j*32 + dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. +
xmmintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then covert the packed signed 32-bit integers in "b" to single-precision (32-bit) floating-point element, and store the results in the upper 2 elements of "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +dst[127:96] := Convert_Int32_To_FP32(b[63:32]) -
immintrin.h
-
- - Floating Point SSE - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) -
immintrin.h
-
- - Floating Point + SSE - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ROUND(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) -
immintrin.h
-
- - Floating Point + SSE - Elementary Math Functions - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". +
xmmintrin.h
+ Convert +
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) +dst[31:0] := a[31:0] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TAN(a[i+63:i]) + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TAN(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TAND(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TAND(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TANH(a[i+63:i]) + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Trigonometry - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TANH(a[i+31:i]) +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point + SSE - Miscellaneous - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TRUNCATE(a[i+63:i]) +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. + +FOR j := 0 to 3 + i := 16*j + k := 32*j + IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) + dst[i+15:i] := 0x7FFF + ELSE + dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) + FI ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Floating Point SSE - Miscellaneous - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TRUNCATE(a[i+31:i]) +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in lower 4 elements of "dst". Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. + +FOR j := 0 to 3 + i := 8*j + k := 32*j + IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) + dst[i+7:i] := 0x7F + ELSE + dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) + FI ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer SSE - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 +
xmmintrin.h
+ Convert +
+ + + + + Store 64-bits of integer data from "a" into memory using a non-temporal memory hint. + +MEM[mem_addr+63:mem_addr] := a[63:0] -
immintrin.h
-
- - Integer + SSE - Arithmetic - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer - SSE - Arithmetic - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) + Store + + + + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. + +FOR j := 0 to 7 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI ENDFOR -dst[MAX:128] := 0 -
immintrin.h
-
- - Integer + SSE +
immintrin.h
Store +
+ - - - Store 16-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). -MEM[mem_addr+15:mem_addr] := a[15:0] +FOR j := 0 to 7 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR -
immintrin.h
-
- - Integer + SSE - Load - - - Load unaligned 64-bit integer from memory into the first element of "dst". +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[MAX:64] := 0 +MEM[mem_addr+127:mem_addr] := a[127:0] - -
immintrin.h
-
- - Integer + SSE +
immintrin.h
Store +
+ - - - Store 64-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + + + Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory. -MEM[mem_addr+63:mem_addr] := a[63:0] +MEM[mem_addr+31:mem_addr] := a[95:64] +MEM[mem_addr+63:mem_addr+32] := a[127:96] - -
immintrin.h
-
- - Integer + SSE - Load - - - Load unaligned 16-bit integer from memory into the first element of "dst". +
immintrin.h
+ Store +
+ + + + + Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory. -dst[15:0] := MEM[mem_addr+15:mem_addr] -dst[MAX:16] := 0 +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[63:32] + + SSE
immintrin.h
-
- - Floating Point - SSE2 - General Support - - - Return vector of type __m128d with undefined elements. -
immintrin.h
-
- - Integer - SSE2 - General Support - - - Return vector of type __m128i with undefined elements. -
immintrin.h
-
- - Integer - SSE2 - Load - - - Load unaligned 32-bit integer from memory into the first element of "dst". + Store + + + + + + Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[MAX:32] := 0 +MEM[mem_addr+31:mem_addr] := a[31:0] - + + SSE
immintrin.h
-
- - Integer - SSE2 Store + + - - - Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + + + Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[31:0] +MEM[mem_addr+95:mem_addr+64] := a[31:0] +MEM[mem_addr+127:mem_addr+96] := a[31:0] - + SSE
immintrin.h
-
- - SSE2 - General Support - - - Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops. - -
emmintrin.h
-
- - SSE2 - General Support - - - Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. - -
emmintrin.h
-
- - SSE2 - General Support - - - Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order. - -
emmintrin.h
-
- - SSE2 - General Support + Store + + - - Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order. - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". + + + Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] -ENDFOR +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[31:0] +MEM[mem_addr+95:mem_addr+64] := a[31:0] +MEM[mem_addr+127:mem_addr+96] := a[31:0] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR +MEM[mem_addr+127:mem_addr] := a[127:0] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". + + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR +MEM[mem_addr+127:mem_addr] := a[127:0] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add 64-bit integers "a" and "b", and store the result in "dst". + + SSE +
immintrin.h
+ Store +
+ + + + + Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[63:0] := a[63:0] + b[63:0] +MEM[mem_addr+31:mem_addr] := a[127:96] +MEM[mem_addr+63:mem_addr+32] := a[95:64] +MEM[mem_addr+95:mem_addr+64] := a[63:32] +MEM[mem_addr+127:mem_addr+96] := a[31:0] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst". + + SSE +
immintrin.h
+ Store +
+ + + + + Store 16-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR +MEM[mem_addr+15:mem_addr] := a[15:0] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + SSE +
immintrin.h
+ Store +
+ + + + + Store 64-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR +MEM[mem_addr+63:mem_addr] := a[63:0] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + + SSE +
immintrin.h
+ Store +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 15 i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + dst[j] := a[i+7] ENDFOR +dst[MAX:8] := 0 - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Probability/Statistics - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Probability/Statistics - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + dst[j] := a[i+7] ENDFOR +dst[MAX:8] := 0 - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Special Math Functions - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Special Math Functions - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst". - -dst[63:0] := a[31:0] * b[31:0] - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+31:i] * b[i+31:i] -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - Miscellaneous - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". - -FOR j := 0 to 15 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR -FOR j := 0 to 1 - i := j*64 - dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ - tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] - dst[i+63:i+16] := 0 -ENDFOR - - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] + IF a[i+31] + dst[j] := 1 + ELSE + dst[j] := 0 + FI ENDFOR +dst[MAX:4] := 0 - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] -ENDFOR +dst[31:0] := SQRT(a[31:0]) +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] + dst[i+31:i] := SQRT(a[i+31:i]) ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst". + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. -dst[63:0] := a[63:0] - b[63:0] +dst[31:0] := (1.0 / a[31:0]) +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) -ENDFOR +dst[31:0] := (1.0 / SQRT(a[31:0])) +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Arithmetic - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) +dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI -ENDFOR +dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". FOR j := 0 to 3 i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI + dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". FOR j := 0 to 3 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI + dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 1 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI -ENDFOR +dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR +dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". FOR j := 0 to 3 i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] < b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". FOR j := 0 to 3 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI + dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) +dst[31:0] := (!( a[31:0] <= b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". -FOR j := 0 to 7 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (!( a[i+31:i] <= b[i+31:i] )) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -FOR j := 0 to 7 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR +dst[31:0] := (!( a[31:0] > b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". FOR j := 0 to 3 i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI + dst[i+31:i] := (!( a[i+31:i] > b[i+31:i] )) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] >= b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". FOR j := 0 to 3 i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI + dst[i+31:i] := (!( a[i+31:i] >= b[i+31:i] )) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Shift - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Integer - SSE2 - Logical - - - - Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[127:0] := (a[127:0] AND b[127:0]) + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 - Logical - - - - Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[127:0] := ((NOT a[127:0]) AND b[127:0]) + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 - Logical - - - - Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[127:0] := (a[127:0] OR b[127:0]) + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 - Logical - - - - Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[127:0] := (a[127:0] XOR b[127:0]) + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 -ENDFOR +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 -ENDFOR +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched. - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := ( a[i+7:i] < b[i+7:i] ) ? 0xFF : 0 -ENDFOR +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Integer - SSE2 + + SSE +
xmmintrin.h
Compare - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched. +
+ + + + Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements. -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ( a[i+15:i] < b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR +dst[31:0] := a[31:0] +dst[127:32] := 0 - -
emmintrin.h
-
- - Integer - SSE2 - Compare - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched. + SSE +
xmmintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". FOR j := 0 to 3 i := j*32 - dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 + dst[i+31:i] := a[31:0] ENDFOR - -
emmintrin.h
-
- - Floating Point - Integer - SSE2 - Convert - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + SSE +
xmmintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". -FOR j := 0 to 1 +FOR j := 0 to 3 i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + dst[i+31:i] := a[31:0] ENDFOR - -
emmintrin.h
-
- - Floating Point - SSE2 - Convert - - - - Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + SSE +
xmmintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. -dst[63:0] := Convert_Int32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 - -
emmintrin.h
-
- - Floating Point - Integer - SSE2 - Convert - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + SSE +
xmmintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[31:0] := e3 +dst[63:32] := e2 +dst[95:64] := e1 +dst[127:96] := e0 - -
emmintrin.h
-
- - Floating Point - Integer - SSE2 - Convert - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + SSE +
xmmintrin.h
+ Set +
+ + + + Return vector of type __m128 with all elements set to zero. -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 +dst[MAX:0] := 0 - -
emmintrin.h
-
- - Floating Point - Integer - SSE2 - Convert - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + SSE +
xmmintrin.h
+ Set +
+ + + + + Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR +dst[31:0] := a[31:0] +dst[63:32] := a[63:32] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+63:mem_addr+32] - -
emmintrin.h
-
- - Floating Point - SSE2 - Convert - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + + SSE +
immintrin.h
+ Load +
+ + + + + Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. -FOR j := 0 to 1 - i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) -ENDFOR +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+63:mem_addr+32] +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] - -
emmintrin.h
-
- - Integer - SSE2 - Convert - - - Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into the lower of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary. -dst[31:0] := a[31:0] +dst[31:0] := MEM[mem_addr+31:mem_addr] dst[127:32] := 0 - -
emmintrin.h
-
- - Integer - SSE2 - Convert - - - Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". -dst[63:0] := a[63:0] -dst[127:64] := 0 +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+31:mem_addr] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+31:mem_addr] - -
emmintrin.h
-
- - Integer - SSE2 - Convert - - - Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". -dst[63:0] := a[63:0] -dst[127:64] := 0 +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+31:mem_addr] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+31:mem_addr] - -
emmintrin.h
-
- - Integer - SSE2 - Convert - - - Copy the lower 32-bit integer in "a" to "dst". + SSE +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[31:0] := a[31:0] +dst[127:0] := MEM[mem_addr+127:mem_addr] - -
emmintrin.h
-
- - Integer - SSE2 - Convert - - - Copy the lower 64-bit integer in "a" to "dst". + + SSE +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -dst[63:0] := a[63:0] +dst[127:0] := MEM[mem_addr+127:mem_addr] - -
emmintrin.h
-
- - Integer - SSE2 - Convert - - - Copy the lower 64-bit integer in "a" to "dst". + + SSE +
immintrin.h
+ Load +
+ + + + Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[63:0] := a[63:0] +dst[31:0] := MEM[mem_addr+127:mem_addr+96] +dst[63:32] := MEM[mem_addr+95:mem_addr+64] +dst[95:64] := MEM[mem_addr+63:mem_addr+32] +dst[127:96] := MEM[mem_addr+31:mem_addr] - -
emmintrin.h
-
- - Integer - SSE2 - Set - - - - Set packed 64-bit integers in "dst" with the supplied values. + SSE +
immintrin.h
+ Load +
+ + + + Load unaligned 64-bit integer from memory into the first element of "dst". -dst[63:0] := e0 -dst[127:64] := e1 +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[MAX:64] := 0 -
emmintrin.h
-
- - Integer - SSE2 - Set - - - - Set packed 64-bit integers in "dst" with the supplied values. + + SSE +
immintrin.h
+ Load +
+ + + + Load unaligned 16-bit integer from memory into the first element of "dst". -dst[63:0] := e0 -dst[127:64] := e1 +dst[15:0] := MEM[mem_addr+15:mem_addr] +dst[MAX:16] := 0 -
emmintrin.h
-
- - Integer - SSE2 - Set - - - - - - Set packed 32-bit integers in "dst" with the supplied values. + SSE +
immintrin.h
+ Load +
+ + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 +dst[31:0] := b[31:0] +dst[127:32] := a[127:32] -
emmintrin.h
-
- - Integer - SSE2 - Set - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values. + + SSE +
xmmintrin.h
+ Move +
+ + + + + Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst". -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 -dst[79:64] := e4 -dst[95:80] := e5 -dst[111:96] := e6 -dst[127:112] := e7 +dst[31:0] := b[95:64] +dst[63:32] := b[127:96] +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] -
emmintrin.h
-
- - Integer - SSE2 - Set - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. + + SSE +
xmmintrin.h
+ Move +
+ + + + + Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst". -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 -dst[71:64] := e8 -dst[79:72] := e9 -dst[87:80] := e10 -dst[95:88] := e11 -dst[103:96] := e12 -dst[111:104] := e13 -dst[119:112] := e14 -dst[127:120] := e15 +dst[31:0] := a[31:0] +dst[63:32] := a[63:32] +dst[95:64] := b[31:0] +dst[127:96] := b[63:32] + + SSE +
xmmintrin.h
+ Move +
+ + + + + + Return vector of type __m128d with undefined elements. + SSE2
emmintrin.h
-
- - Integer + General Support + + + + + Return vector of type __m128i with undefined elements. SSE2 - Set - - - Broadcast 64-bit integer "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -
emmintrin.h
-
- - Integer + General Support + + + + + Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops. + SSE2 - Set - - - Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -
emmintrin.h
-
- - Integer + General Support + + + + + Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. + SSE2 - Set - - - Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -
emmintrin.h
-
- - Integer + General Support + + + + + Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order. + SSE2 - Set - - - Broadcast 16-bit integer "a" to all all elements of "dst". This intrinsic may generate "vpbroadcastw". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -
emmintrin.h
-
- - Integer + General Support + + + + + Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order. + SSE2 - Set - - - Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb". +
emmintrin.h
+ General Support +
+ + + + Load unaligned 32-bit integer from memory into the first element of "dst". -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[MAX:32] := 0 -
emmintrin.h
-
- - Integer + SSE2 - Set - - - - Set packed 64-bit integers in "dst" with the supplied values in reverse order. +
emmintrin.h
+ Load +
+ + + + Load 64-bit integer from memory into the first element of "dst". -dst[63:0] := e1 -dst[127:64] := e0 +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[MAX:64] := 0 -
emmintrin.h
-
- - Integer + SSE2 - Set - - - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. +
emmintrin.h
+ Load +
+ + + + Load 128-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[31:0] := e3 -dst[63:32] := e2 -dst[95:64] := e1 -dst[127:96] := e0 +dst[127:0] := MEM[mem_addr+127:mem_addr] -
emmintrin.h
-
- - Integer + SSE2 - Set - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values in reverse order. +
emmintrin.h
+ Load +
+ + + + Load 128-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. -dst[15:0] := e7 -dst[31:16] := e6 -dst[47:32] := e5 -dst[63:48] := e4 -dst[79:64] := e3 -dst[95:80] := e2 -dst[111:96] := e1 -dst[127:112] := e0 +dst[127:0] := MEM[mem_addr+127:mem_addr] + + SSE2
emmintrin.h
-
- - Integer + Load + + + + + Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + SSE2 - Set - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values in reverse order. +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". -dst[7:0] := e15 -dst[15:8] := e14 -dst[23:16] := e13 -dst[31:24] := e12 -dst[39:32] := e11 -dst[47:40] := e10 -dst[55:48] := e9 -dst[63:56] := e8 -dst[71:64] := e7 -dst[79:72] := e6 -dst[87:80] := e5 -dst[95:88] := e4 -dst[103:96] := e3 -dst[111:104] := e2 -dst[119:112] := e1 -dst[127:120] := e0 +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] -
emmintrin.h
-
- - Integer + SSE2 - Set - - Return vector of type __m128i with all elements set to zero. +
emmintrin.h
+ Load +
+ + + + Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[MAX:0] := 0 +dst[63:0] := MEM[mem_addr+127:mem_addr+64] +dst[127:64] := MEM[mem_addr+63:mem_addr] - + + SSE2
emmintrin.h
-
- - Integer + Load + + + + + Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + SSE2 +
emmintrin.h
Load - - - Load 64-bit integer from memory into the first element of "dst". +
+ + + + Load a double-precision (64-bit) floating-point element from memory into the lower of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary. dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[MAX:64] := 0 +dst[127:64] := 0 - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Load - - - Load 128-bits of integer data from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. -dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[63:0] := a[63:0] +dst[127:64] := MEM[mem_addr+63:mem_addr] - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Load - - - Load 128-bits of integer data from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. -dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := a[127:64] - + + SSE2
emmintrin.h
-
- - Integer + Load + + + + + + Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + SSE2 +
emmintrin.h
Store +
+ - - - + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary. FOR j := 0 to 15 @@ -141999,545 +161880,632 @@ FOR j := 0 to 15 FI ENDFOR - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Store +
+ - - + + Store 128-bits of integer data from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. MEM[mem_addr+127:mem_addr] := a[127:0] - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Store +
+ - - + + Store 128-bits of integer data from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. MEM[mem_addr+127:mem_addr] := a[127:0] - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Store +
+ - - + + Store 64-bit integer from the first element of "a" into memory. MEM[mem_addr+63:mem_addr] := a[63:0] - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Store +
+ - - + + Store 128-bits of integer data from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. MEM[mem_addr+127:mem_addr] := a[127:0] - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Store +
+ - - + + Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. MEM[mem_addr+31:mem_addr] := a[31:0] - -
emmintrin.h
-
- - Integer + SSE2 +
emmintrin.h
Store +
+ - - + + Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. MEM[mem_addr+63:mem_addr] := a[63:0] - -
emmintrin.h
-
- - Integer + SSE2 - Miscellaneous - - - Copy the lower 64-bit integer in "a" to "dst". +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[63:0] := a[63:0] +MEM[mem_addr+127:mem_addr] := a[127:0] - -
emmintrin.h
-
- - Integer + SSE2 - Move - - - Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element. +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. -dst[63:0] := a[63:0] -dst[127:64] := 0 +MEM[mem_addr+63:mem_addr] := a[63:0] - -
emmintrin.h
-
- - Integer + SSE2 - Move - - - Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element. +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[63:0] := a[63:0] -dst[127:64] := 0 +MEM[mem_addr+63:mem_addr] := a[63:0] +MEM[mem_addr+127:mem_addr+64] := a[63:0] - + SSE2
emmintrin.h
-
- - Integer + Store + + + + + + Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+63:mem_addr] := a[63:0] +MEM[mem_addr+127:mem_addr+64] := a[63:0] + SSE2 - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(a[79:64]) -dst[47:40] := Saturate8(a[95:80]) -dst[55:48] := Saturate8(a[111:96]) -dst[63:56] := Saturate8(a[127:112]) -dst[71:64] := Saturate8(b[15:0]) -dst[79:72] := Saturate8(b[31:16]) -dst[87:80] := Saturate8(b[47:32]) -dst[95:88] := Saturate8(b[63:48]) -dst[103:96] := Saturate8(b[79:64]) -dst[111:104] := Saturate8(b[95:80]) -dst[119:112] := Saturate8(b[111:96]) -dst[127:120] := Saturate8(b[127:112]) +MEM[mem_addr+127:mem_addr] := a[127:0] - + + SSE2
emmintrin.h
-
- - Integer + Store + + + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + SSE2 - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". +
emmintrin.h
+ Store +
+ + + + + Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(a[95:64]) -dst[63:48] := Saturate16(a[127:96]) -dst[79:64] := Saturate16(b[31:0]) -dst[95:80] := Saturate16(b[63:32]) -dst[111:96] := Saturate16(b[95:64]) -dst[127:112] := Saturate16(b[127:96]) +MEM[mem_addr+63:mem_addr] := a[127:64] +MEM[mem_addr+127:mem_addr+64] := a[63:0] - + SSE2
emmintrin.h
-
- - Integer + Store + + + + + + Store the upper double-precision (64-bit) floating-point element from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[127:64] + + SSE2 - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory. -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(a[79:64]) -dst[47:40] := SaturateU8(a[95:80]) -dst[55:48] := SaturateU8(a[111:96]) -dst[63:56] := SaturateU8(a[127:112]) -dst[71:64] := SaturateU8(b[15:0]) -dst[79:72] := SaturateU8(b[31:16]) -dst[87:80] := SaturateU8(b[47:32]) -dst[95:88] := SaturateU8(b[63:48]) -dst[103:96] := SaturateU8(b[79:64]) -dst[111:104] := SaturateU8(b[95:80]) -dst[119:112] := SaturateU8(b[111:96]) -dst[127:120] := SaturateU8(b[127:112]) +MEM[mem_addr+63:mem_addr] := a[63:0] - + + SSE2
emmintrin.h
-
- - Integer + Store + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR + + SSE2 - Swizzle - - - - Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". -dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0] -dst[31:16] := 0 +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + SSE2 - Swizzle - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". +
emmintrin.h
+ Arithmetic +
+ + + + + Add 64-bit integers "a" and "b", and store the result in "dst". -dst[127:0] := a[127:0] -sel := imm8[2:0]*16 -dst[sel+15:sel] := i[15:0] +dst[63:0] := a[63:0] + b[63:0] - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR + + SSE2 - Miscellaneous - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". FOR j := 0 to 15 i := j*8 - dst[j] := a[i+7] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) ENDFOR -dst[MAX:16] := 0 - -
emmintrin.h
-
- - Integer + SSE2 - Swizzle - - - - Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR - -
emmintrin.h
-
- - Integer + SSE2 - Swizzle - - - - Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". -dst[63:0] := a[63:0] -dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR - -
emmintrin.h
-
- - Integer + SSE2 - Swizzle - - - - Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". -dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR + + SSE2 - Swizzle - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR + + SSE2 - Swizzle - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst". + +dst[63:0] := a[31:0] * b[31:0] + + SSE2 - Swizzle - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". + +FOR j := 0 to 15 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +FOR j := 0 to 1 + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 +ENDFOR + + SSE2 - Swizzle - - - - Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR + + SSE2 - Swizzle - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst". + +dst[63:0] := a[63:0] - b[63:0] + + SSE2 - Swizzle - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + SSE2 - Swizzle - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR - + + SSE2
emmintrin.h
-
- - Integer + Arithmetic + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + SSE2 - Swizzle - - - - Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Arithmetic - - - +
+ + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := a[63:0] + b[63:0] dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Arithmetic - - - +
+ + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 1 @@ -142545,31 +162513,29 @@ FOR j := 0 to 1 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Arithmetic - - - +
+ + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := a[63:0] / b[63:0] dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Arithmetic - - - +
+ + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". FOR j := 0 to 1 @@ -142577,175 +162543,671 @@ FOR j := 0 to 1 dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR - + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] * b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] - b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR + + + SSE2 +
emmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR + + + SSE2 +
emmintrin.h
+ Probability/Statistics +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE2
emmintrin.h
-
- - Floating Point + Special Math Functions + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + SSE2 +
emmintrin.h
Special Math Functions - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [max_float_note] dst[63:0] := MAX(a[63:0], b[63:0]) dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Special Math Functions - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] FOR j := 0 to 1 i := j*64 dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Special Math Functions - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [min_float_note] dst[63:0] := MIN(a[63:0], b[63:0]) dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Special Math Functions - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] FOR j := 0 to 1 i := j*64 dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR - + + SSE2
emmintrin.h
-
- - Floating Point + Special Math Functions + + + + + + Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) + + SSE2 - Arithmetic - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
emmintrin.h
+ Shift +
+ + + + + Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". -dst[63:0] := a[63:0] * b[63:0] -dst[127:64] := a[127:64] +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) - + + SSE2
emmintrin.h
-
- - Floating Point + Shift + + + + + + Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) + + SSE2 - Arithmetic - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI ENDFOR - + + SSE2
emmintrin.h
-
- - Floating Point + Shift + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR + + SSE2 - Elementary Math Functions - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". -dst[63:0] := SQRT(b[63:0]) -dst[127:64] := a[127:64] +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR - + + SSE2
emmintrin.h
-
- - Floating Point + Shift + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR + + SSE2 - Elementary Math Functions - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI ENDFOR - + + SSE2
emmintrin.h
-
- - Floating Point + Shift + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR + + SSE2 - Arithmetic - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". -dst[63:0] := a[63:0] - b[63:0] -dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR - + + SSE2
emmintrin.h
-
- - Floating Point + Shift + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + SSE2 - Arithmetic - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI ENDFOR - + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] AND b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[127:0] := ((NOT a[127:0]) AND b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] OR b[127:0]) + + + SSE2
emmintrin.h
-
- - Floating Point + Logical + + + + + + Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] XOR b[127:0]) + + SSE2 +
emmintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 1 @@ -142753,16 +163215,15 @@ FOR j := 0 to 1 dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Logical - - - +
+ + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". FOR j := 0 to 1 @@ -142770,16 +163231,15 @@ FOR j := 0 to 1 dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Logical - - - +
+ + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 1 @@ -142787,16 +163247,15 @@ FOR j := 0 to 1 dst[i+63:i] := a[i+63:i] OR b[i+63:i] ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Logical - - - +
+ + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". FOR j := 0 to 1 @@ -142804,194 +163263,325 @@ FOR j := 0 to 1 dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ENDFOR - + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched. + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] < b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched. + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] < b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2
emmintrin.h
-
- - Floating Point + Compare + + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (!(a[63:0] < b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (!(a[63:0] <= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (!(a[63:0] > b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := (!(a[63:0] >= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 1 @@ -142999,16 +163589,15 @@ FOR j := 0 to 1 dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". FOR j := 0 to 1 @@ -143016,16 +163605,15 @@ FOR j := 0 to 1 dst[i+63:i] := (a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". FOR j := 0 to 1 @@ -143033,16 +163621,15 @@ FOR j := 0 to 1 dst[i+63:i] := (a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". FOR j := 0 to 1 @@ -143050,16 +163637,15 @@ FOR j := 0 to 1 dst[i+63:i] := (a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". FOR j := 0 to 1 @@ -143067,48 +163653,45 @@ FOR j := 0 to 1 dst[i+63:i] := (a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". FOR j := 0 to 1 i := j*64 dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". FOR j := 0 to 1 @@ -143116,16 +163699,15 @@ FOR j := 0 to 1 dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". FOR j := 0 to 1 @@ -143133,16 +163715,15 @@ FOR j := 0 to 1 dst[i+63:i] := (!(a[i+63:i] < b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". FOR j := 0 to 1 @@ -143150,16 +163731,15 @@ FOR j := 0 to 1 dst[i+63:i] := (!(a[i+63:i] <= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". FOR j := 0 to 1 @@ -143167,16 +163747,15 @@ FOR j := 0 to 1 dst[i+63:i] := (!(a[i+63:i] > b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". FOR j := 0 to 1 @@ -143184,195 +163763,325 @@ FOR j := 0 to 1 dst[i+63:i] := (!(a[i+63:i] >= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). - -RETURN ( a[63:0] == b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). - -RETURN ( a[63:0] < b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). - -RETURN ( a[63:0] <= b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). - -RETURN ( a[63:0] > b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). - -RETURN ( a[63:0] >= b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). - -RETURN ( a[63:0] != b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[63:0] == b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[63:0] < b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[63:0] <= b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[63:0] > b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - -RETURN ( a[63:0] >= b[63:0] ) ? 1 : 0 + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 - -
emmintrin.h
-
- - Floating Point - Flag + SSE2 +
emmintrin.h
Compare - - - +
+ + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". + +dst[31:0] := a[31:0] +dst[127:32] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 64-bit integer in "a" to "dst". -RETURN ( a[63:0] != b[63:0] ) ? 1 : 0 +dst[63:0] := a[63:0] - + + SSE2
emmintrin.h
-
- - Floating Point + Convert + + + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 1 @@ -143382,15 +164091,14 @@ FOR j := 0 to 1 ENDFOR dst[127:64] := 0 - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". FOR j := 0 to 1 @@ -143399,16 +164107,14 @@ FOR j := 0 to 1 dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -143417,103 +164123,92 @@ FOR j := 0 to 1 dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". dst[31:0] := Convert_FP64_To_Int32(a[63:0]) - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Convert - - - +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". dst[31:0] := Convert_FP64_To_FP32(b[63:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0 - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Convert - - +
+ + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". dst[63:0] := a[63:0] - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Convert - - - +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". dst[63:0] := Convert_FP32_To_FP64(b[31:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0 - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 1 @@ -143522,58 +164217,50 @@ FOR j := 0 to 1 dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 3 @@ -143581,16 +164268,14 @@ FOR j := 0 to 3 dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". FOR j := 0 to 3 @@ -143598,16 +164283,14 @@ FOR j := 0 to 3 dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR - -
emmintrin.h
-
- - Floating Point - Integer + SSE2 +
emmintrin.h
Convert - - +
+ + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -143616,362 +164299,836 @@ FOR j := 0 to 1 dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR - -
emmintrin.h
-
- - Floating Point - Integer + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all all elements of "dst". This intrinsic may generate "vpbroadcastw". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e1 +dst[127:64] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e3 +dst[63:32] := e2 +dst[95:64] := e1 +dst[127:96] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e7 +dst[31:16] := e6 +dst[47:32] := e5 +dst[63:48] := e4 +dst[79:64] := e3 +dst[95:80] := e2 +dst[111:96] := e1 +dst[127:112] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e15 +dst[15:8] := e14 +dst[23:16] := e13 +dst[31:24] := e12 +dst[39:32] := e11 +dst[47:40] := e10 +dst[55:48] := e9 +dst[63:56] := e8 +dst[71:64] := e7 +dst[79:72] := e6 +dst[87:80] := e5 +dst[95:88] := e4 +dst[103:96] := e3 +dst[111:104] := e2 +dst[119:112] := e1 +dst[127:120] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + Return vector of type __m128i with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE2 +
emmintrin.h
+ Set +
+ + + + Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e1 +dst[127:64] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Return vector of type __m128d with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE2 +
emmintrin.h
+ Set +
+ + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) + + SSE2 - Convert - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". +
emmintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +FOR j := 0 to 15 + i := j*8 + dst[j] := a[i+7] ENDFOR +dst[MAX:16] := 0 - -
emmintrin.h
-
- - Floating Point + SSE2 - Set - - - Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element. - -dst[63:0] := a[63:0] -dst[127:64] := 0 -
emmintrin.h
-
- - Floating Point - SSE2 - Set - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + Miscellaneous + + + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := a[63:0] + IF a[i+63] + dst[j] := 1 + ELSE + dst[j] := 0 + FI ENDFOR +dst[MAX:2] := 0 -
emmintrin.h
-
- - Floating Point + SSE2 - Set - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -
emmintrin.h
-
- - Floating Point - SSE2 - Set - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + Miscellaneous + + + + + Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element. -dst[63:0] := e0 -dst[127:64] := e1 +dst[63:0] := a[63:0] +dst[127:64] := 0 -
emmintrin.h
-
- - Floating Point + SSE2 - Set - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[63:0] := e1 -dst[127:64] := e0 -
emmintrin.h
-
- - Floating Point - SSE2 - Set - - - Return vector of type __m128d with all elements set to zero. + Move + + + + + Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element. -dst[MAX:0] := 0 +dst[63:0] := a[63:0] +dst[127:64] := 0 - -
emmintrin.h
-
- - Floating Point + SSE2 - Load - - - Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - -
emmintrin.h
-
- - Floating Point - SSE2 - Load - - - Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + Move + + + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := MEM[mem_addr+63:mem_addr] +dst[63:0] := b[63:0] +dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 - Load - - - Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := MEM[mem_addr+63:mem_addr] - -
emmintrin.h
-
- - Floating Point - SSE2 - Load - - - Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. + Move + + + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". -dst[63:0] := MEM[mem_addr+127:mem_addr+64] -dst[127:64] := MEM[mem_addr+63:mem_addr] +dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0] +dst[31:16] := 0 - -
emmintrin.h
-
- - Floating Point + SSE2 - Load - - - Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. +
emmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". -dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[127:0] := a[127:0] +sel := imm8[2:0]*16 +dst[sel+15:sel] := i[15:0] - -
emmintrin.h
-
- - Floating Point + SSE2 - Load - - - Load a double-precision (64-bit) floating-point element from memory into the lower of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary. +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst". -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := 0 +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) - -
emmintrin.h
-
- - Floating Point + SSE2 - Load - - - - Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst". dst[63:0] := a[63:0] -dst[127:64] := MEM[mem_addr+63:mem_addr] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] - -
emmintrin.h
-
- - Floating Point + SSE2 - Load - - - - Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst". -dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] dst[127:64] := a[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - -
emmintrin.h
-
- - Floating Point - SSE2 - Store - - - - Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + Swizzle + + + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". -MEM[mem_addr+63:mem_addr] := a[63:0] +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". -MEM[mem_addr+63:mem_addr] := a[63:0] -MEM[mem_addr+127:mem_addr+64] := a[63:0] +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". -MEM[mem_addr+63:mem_addr] := a[63:0] -MEM[mem_addr+127:mem_addr+64] := a[63:0] +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". -MEM[mem_addr+127:mem_addr] := a[127:0] +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". -MEM[mem_addr+127:mem_addr] := a[127:0] +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". -MEM[mem_addr+63:mem_addr] := a[127:64] -MEM[mem_addr+127:mem_addr+64] := a[63:0] +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store the upper double-precision (64-bit) floating-point element from "a" into memory. +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". -MEM[mem_addr+63:mem_addr] := a[127:64] +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 - Store - - - - Store the lower double-precision (64-bit) floating-point element from "a" into memory. +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". -MEM[mem_addr+63:mem_addr] := a[63:0] +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Swizzle - - - +
+ + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { @@ -143981,16 +165138,15 @@ DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 +
emmintrin.h
Swizzle - - - +
+ + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { @@ -144000,124 +165156,109 @@ DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) - -
emmintrin.h
-
- - Floating Point + SSE2 - Miscellaneous - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". - -FOR j := 0 to 1 - i := j*64 - IF a[i+63] - dst[j] := 1 - ELSE - dst[j] := 0 - FI -ENDFOR -dst[MAX:2] := 0 - -
emmintrin.h
-
- - Floating Point - SSE2 Swizzle - - - - + + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst". dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] - -
emmintrin.h
-
- - Floating Point + SSE2 - Move - - - - Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". +
emmintrin.h
+ Swizzle +
+ + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". -dst[63:0] := b[63:0] +dst[63:0] := SQRT(b[63:0]) dst[127:64] := a[127:64] - + + SSE2
emmintrin.h
-
- - Floating Point + Elementary Math Functions + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR + + SSE2 - Cast - - - Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
emmintrin.h
-
- - Floating Point - Integer + Elementary Math Functions + + + + + Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. SSE2 +
emmintrin.h
Cast - - +
+ + + Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
emmintrin.h
-
- - Floating Point SSE2 +
emmintrin.h
Cast - - +
+ + + Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
emmintrin.h
-
- - Floating Point - Integer SSE2 +
emmintrin.h
Cast - - +
+ + + Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
emmintrin.h
-
- - Floating Point SSE2 +
emmintrin.h
Cast - - +
+ + + Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
emmintrin.h
-
- - Floating Point SSE2 +
emmintrin.h
Cast - - +
+ + + Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2
emmintrin.h
-
- - Floating Point - SSE3 - Arithmetic - - - + Cast + + + + + + + Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". FOR j := 0 to 3 @@ -144129,16 +165270,15 @@ FOR j := 0 to 3 FI ENDFOR - -
pmmintrin.h
-
- - Floating Point + SSE3 +
pmmintrin.h
Arithmetic - - - +
+ + + + Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". FOR j := 0 to 1 @@ -144150,31 +165290,29 @@ FOR j := 0 to 1 FI ENDFOR - -
pmmintrin.h
-
- - Floating Point + SSE3 +
pmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". dst[63:0] := a[127:64] + a[63:0] dst[127:64] := b[127:64] + b[63:0] - -
pmmintrin.h
-
- - Floating Point + SSE3 +
pmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". dst[31:0] := a[63:32] + a[31:0] @@ -144182,31 +165320,29 @@ dst[63:32] := a[127:96] + a[95:64] dst[95:64] := b[63:32] + b[31:0] dst[127:96] := b[127:96] + b[95:64] - -
pmmintrin.h
-
- - Floating Point + SSE3 +
pmmintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". dst[63:0] := a[63:0] - a[127:64] dst[127:64] := b[63:0] - b[127:64] - -
pmmintrin.h
-
- - Floating Point + SSE3 +
pmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". dst[31:0] := a[31:0] - a[63:32] @@ -144214,56 +165350,52 @@ dst[63:32] := a[95:64] - a[127:96] dst[95:64] := b[31:0] - b[63:32] dst[127:96] := b[95:64] - b[127:96] - -
pmmintrin.h
-
- - Integer + SSE3 - Load - - +
pmmintrin.h
+ Arithmetic +
+ + + Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary. dst[127:0] := MEM[mem_addr+127:mem_addr] - -
pmmintrin.h
-
- - Floating Point + SSE3 - Move - - - Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst". - -dst[63:0] := a[63:0] -dst[127:64] := a[63:0] - -
pmmintrin.h
-
- - Floating Point - SSE3 Load - - + + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". dst[63:0] := MEM[mem_addr+63:mem_addr] dst[127:64] := MEM[mem_addr+63:mem_addr] - + + SSE3
pmmintrin.h
-
- - Floating Point + Load + + + + + Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst". + +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] + + SSE3 +
pmmintrin.h
Move - - +
+ + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". dst[31:0] := a[63:32] @@ -144271,15 +165403,14 @@ dst[63:32] := a[63:32] dst[95:64] := a[127:96] dst[127:96] := a[127:96] - -
pmmintrin.h
-
- - Floating Point + SSE3 +
pmmintrin.h
Move - - +
+ + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". dst[31:0] := a[31:0] @@ -144287,17 +165418,18 @@ dst[63:32] := a[31:0] dst[95:64] := a[95:64] dst[127:96] := a[95:64] - + + SSE3
pmmintrin.h
-
- - Floating Point - SSE4.1 - Swizzle - - - - + Move + + + + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". FOR j := 0 to 1 @@ -144309,17 +165441,16 @@ FOR j := 0 to 1 FI ENDFOR - -
smmintrin.h
-
- - Floating Point + SSE4.1 +
smmintrin.h
Swizzle - - - - +
+ + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". FOR j := 0 to 3 @@ -144331,17 +165462,16 @@ FOR j := 0 to 3 FI ENDFOR - -
smmintrin.h
-
- - Floating Point + SSE4.1 +
smmintrin.h
Swizzle - - - - +
+ + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". FOR j := 0 to 1 @@ -144353,17 +165483,16 @@ FOR j := 0 to 1 FI ENDFOR - -
smmintrin.h
-
- - Floating Point + SSE4.1 +
smmintrin.h
Swizzle - - - - +
+ + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". FOR j := 0 to 3 @@ -144375,17 +165504,16 @@ FOR j := 0 to 3 FI ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Swizzle - - - - +
+ + + + + Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". FOR j := 0 to 15 @@ -144397,17 +165525,16 @@ FOR j := 0 to 15 FI ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Swizzle - - - - +
+ + + + + Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". FOR j := 0 to 7 @@ -144419,17 +165546,151 @@ FOR j := 0 to 7 FI ENDFOR - + + SSE4.1
smmintrin.h
-
- - Floating Point + Swizzle + + + + + + Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst". + +dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] + + SSE4.1 - Arithmetic - - - - +
smmintrin.h
+ Swizzle +
+ + + + + Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[7:0] := (a[127:0] >> (imm8[3:0] * 8))[7:0] +dst[31:8] := 0 + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst". + +dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst". + +dst[63:0] := (a[127:0] >> (imm8[0] * 64))[63:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). + +tmp2[127:0] := a[127:0] +CASE (imm8[7:6]) OF +0: tmp1[31:0] := b[31:0] +1: tmp1[31:0] := b[63:32] +2: tmp1[31:0] := b[95:64] +3: tmp1[31:0] := b[127:96] +ESAC +CASE (imm8[5:4]) OF +0: tmp2[31:0] := tmp1[31:0] +1: tmp2[63:32] := tmp1[31:0] +2: tmp2[95:64] := tmp1[31:0] +3: tmp2[127:96] := tmp1[31:0] +ESAC +FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := tmp2[i+31:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[3:0]*8 +dst[sel+7:sel] := i[7:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[1:0]*32 +dst[sel+31:sel] := i[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[0]*64 +dst[sel+63:sel] := i[63:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { @@ -144456,17 +165717,16 @@ DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { } dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) - -
smmintrin.h
-
- - Floating Point + SSE4.1 +
smmintrin.h
Arithmetic - - - - +
+ + + + + Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { @@ -144493,159 +165753,76 @@ DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { } dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) - -
smmintrin.h
-
- - Floating Point + SSE4.1 - Swizzle - - - - Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst". - -dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] - -
smmintrin.h
-
- - Integer - SSE4.1 - Swizzle - - - - Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". - -dst[7:0] := (a[127:0] >> (imm8[3:0] * 8))[7:0] -dst[31:8] := 0 - - -
smmintrin.h
-
- - Integer - SSE4.1 - Swizzle - - - - Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst". + Arithmetic + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". -dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 - Swizzle - - - - Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst". - -dst[63:0] := (a[127:0] >> (imm8[0] * 64))[63:0] - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Swizzle - - - - - Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). + Arithmetic + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". -tmp2[127:0] := a[127:0] -CASE (imm8[7:6]) OF -0: tmp1[31:0] := b[31:0] -1: tmp1[31:0] := b[63:32] -2: tmp1[31:0] := b[95:64] -3: tmp1[31:0] := b[127:96] -ESAC -CASE (imm8[5:4]) OF -0: tmp2[31:0] := tmp1[31:0] -1: tmp2[63:32] := tmp1[31:0] -2: tmp2[95:64] := tmp1[31:0] -3: tmp2[127:96] := tmp1[31:0] -ESAC FOR j := 0 to 3 i := j*32 - IF imm8[j%8] - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := tmp2[i+31:i] - FI + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 - Swizzle - - - - - Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". - -dst[127:0] := a[127:0] -sel := imm8[3:0]*8 -dst[sel+7:sel] := i[7:0] - -
smmintrin.h
-
- - Integer - SSE4.1 - Swizzle - - - - - Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". + Arithmetic + + + Miscellaneous + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at on the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". -dst[127:0] := a[127:0] -sel := imm8[1:0]*32 -dst[sel+31:sel] := i[31:0] +DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { + a_offset := imm8[2]*32 + b_offset := imm8[1:0]*32 + FOR j := 0 to 7 + i := j*8 + k := a_offset+i + l := b_offset + tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ + ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) + ENDFOR + RETURN tmp[127:0] +} +dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) - -
smmintrin.h
-
- - Integer + SSE4.1 - Swizzle - - - - - Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". - -dst[127:0] := a[127:0] -sel := imm8[0]*64 -dst[sel+63:sel] := i[63:0] - -
smmintrin.h
-
- - Integer - SSE4.1 - Special Math Functions - - - + Arithmetic + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". FOR j := 0 to 15 @@ -144653,16 +165830,15 @@ FOR j := 0 to 15 dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Special Math Functions - - - +
+ + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". FOR j := 0 to 3 @@ -144670,16 +165846,15 @@ FOR j := 0 to 3 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Special Math Functions - - - +
+ + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". FOR j := 0 to 3 @@ -144687,16 +165862,15 @@ FOR j := 0 to 3 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Special Math Functions - - - +
+ + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". FOR j := 0 to 7 @@ -144704,16 +165878,15 @@ FOR j := 0 to 7 dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Special Math Functions - - - +
+ + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 15 @@ -144721,16 +165894,15 @@ FOR j := 0 to 15 dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Special Math Functions - - - +
+ + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 3 @@ -144738,16 +165910,15 @@ FOR j := 0 to 3 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Special Math Functions - - - +
+ + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 3 @@ -144755,16 +165926,15 @@ FOR j := 0 to 3 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Special Math Functions - - - +
+ + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". FOR j := 0 to 7 @@ -144772,17 +165942,198 @@ FOR j := 0 to 7 dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) ENDFOR - + + SSE4.1
smmintrin.h
-
- - Integer + Special Math Functions + + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i], rounding) +ENDFOR + + SSE4.1 - Convert +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i], rounding) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + + Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := ROUND(b[63:0], rounding) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := FLOOR(b[63:0]) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := CEIL(b[63:0]) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + + Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := ROUND(b[31:0], rounding) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := FLOOR(b[31:0]) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := CEIL(b[31:0]) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ Miscellaneous - - - + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". dst[15:0] := SaturateU16(a[31:0]) @@ -144794,32 +166145,14 @@ dst[95:80] := SaturateU16(b[63:32]) dst[111:96] := SaturateU16(b[95:64]) dst[127:112] := SaturateU16(b[127:96]) - -
smmintrin.h
-
- - Integer + SSE4.1 - Compare - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - -
smmintrin.h
-
- - Integer - SSE4.1 Convert - - + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". FOR j := 0 to 7 @@ -144828,15 +166161,14 @@ FOR j := 0 to 7 dst[l+15:l] := SignExtend16(a[i+7:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 3 @@ -144845,15 +166177,14 @@ FOR j := 0 to 3 dst[i+31:i] := SignExtend32(a[k+7:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -144862,15 +166193,14 @@ FOR j := 0 to 1 dst[i+63:i] := SignExtend64(a[k+7:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 3 @@ -144879,15 +166209,14 @@ FOR j := 0 to 3 dst[i+31:i] := SignExtend32(a[k+15:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -144896,15 +166225,14 @@ FOR j := 0 to 1 dst[i+63:i] := SignExtend64(a[k+15:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -144913,15 +166241,14 @@ FOR j := 0 to 1 dst[i+63:i] := SignExtend64(a[k+31:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". FOR j := 0 to 7 @@ -144930,15 +166257,14 @@ FOR j := 0 to 7 dst[l+15:l] := ZeroExtend16(a[i+7:i]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 3 @@ -144947,15 +166273,14 @@ FOR j := 0 to 3 dst[i+31:i] := ZeroExtend32(a[k+7:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Zero extend packed unsigned 8-bit integers in the low 8 byte sof "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -144964,15 +166289,14 @@ FOR j := 0 to 1 dst[i+63:i] := ZeroExtend64(a[k+7:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". FOR j := 0 to 3 @@ -144981,15 +166305,14 @@ FOR j := 0 to 3 dst[i+31:i] := ZeroExtend32(a[k+15:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -144998,15 +166321,14 @@ FOR j := 0 to 1 dst[i+63:i] := ZeroExtend64(a[k+15:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 +
smmintrin.h
Convert - - +
+ + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". FOR j := 0 to 1 @@ -145015,52 +166337,31 @@ FOR j := 0 to 1 dst[i+63:i] := ZeroExtend64(a[k+31:k]) ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 - Arithmetic - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". +
smmintrin.h
+ Convert +
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". FOR j := 0 to 1 i := j*64 - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
smmintrin.h
-
- - Integer + SSE4.1 - Arithmetic - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*32 - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] -ENDFOR - -
smmintrin.h
-
- - Integer - Flag - SSE4.1 - Logical - - - + Compare + + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. IF ((a[127:0] AND b[127:0]) == 0) @@ -145075,17 +166376,15 @@ ELSE FI RETURN ZF - -
smmintrin.h
-
- - Integer - Flag + SSE4.1 +
smmintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. IF ((a[127:0] AND b[127:0]) == 0) @@ -145100,17 +166399,15 @@ ELSE FI RETURN CF - -
smmintrin.h
-
- - Integer - Flag + SSE4.1 +
smmintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. IF ((a[127:0] AND b[127:0]) == 0) @@ -145129,17 +166426,15 @@ ELSE dst := 0 FI - -
smmintrin.h
-
- - Integer - Flag + SSE4.1 +
smmintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and return 1 if the result is zero, otherwise return 0. IF ((a[127:0] AND mask[127:0]) == 0) @@ -145149,17 +166444,15 @@ ELSE FI dst := ZF - -
smmintrin.h
-
- - Integer - Flag + SSE4.1 +
smmintrin.h
Logical - - - +
+ + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "mask", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. IF ((a[127:0] AND mask[127:0]) == 0) @@ -145178,16 +166471,14 @@ ELSE dst := 0 FI - -
smmintrin.h
-
- - Integer - Flag + SSE4.1 +
smmintrin.h
Logical - - +
+ + + Compute the bitwise NOT of "a" and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0. FOR j := 0 to 127 @@ -145200,210 +166491,15 @@ ELSE FI dst := CF - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - - Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". - [round_note] - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i], rounding) -ENDFOR - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - - Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". - [round_note] - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ROUND(a[i+31:i], rounding) -ENDFOR - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) -ENDFOR - - -
smmintrin.h
-
- - Floating Point + + SSE4.1 - Special Math Functions - - - - - Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := ROUND(b[63:0], rounding) -dst[127:64] := a[127:64] - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - - Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := FLOOR(b[63:0]) -dst[127:64] := a[127:64] - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - - Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := CEIL(b[63:0]) -dst[127:64] := a[127:64] - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - - - Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := ROUND(b[31:0], rounding) -dst[127:32] := a[127:32] - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - - Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := FLOOR(b[31:0]) -dst[127:32] := a[127:32] - - -
smmintrin.h
-
- - Floating Point - SSE4.1 - Special Math Functions - - - - Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := CEIL(b[31:0]) -dst[127:32] := a[127:32] - - -
smmintrin.h
-
- - Integer - SSE4.1 - Miscellaneous - - + Logical + + + + Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst". index[2:0] := 0 @@ -145419,59 +166515,31 @@ dst[15:0] := min[15:0] dst[18:16] := index[2:0] dst[127:19] := 0 - -
smmintrin.h
-
- - Integer + SSE4.1 - Arithmetic - Miscellaneous - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at on the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". - -DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { - a_offset := imm8[2]*32 - b_offset := imm8[1:0]*32 - FOR j := 0 to 7 - i := j*8 - k := a_offset+i - l := b_offset - tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ - ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) - ENDFOR - RETURN tmp[127:0] -} -dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) - -
smmintrin.h
-
- - Integer - SSE4.1 - Load - - + Miscellaneous + + + + Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. dst[127:0] := MEM[mem_addr+127:mem_addr] - + + SSE4.1
smmintrin.h
-
- - SSE4.2 - String Compare - - - - + Load + + + + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated mask in "dst". [strcmp_note] @@ -145599,17 +166667,16 @@ ELSE // bit mask dst[127:UpperBound+1] := 0 FI - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - +
+ + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated index in "dst". [strcmp_note] @@ -145739,17 +166806,16 @@ ELSE // least significant bit OD FI - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - +
+ + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. [strcmp_note] @@ -145764,17 +166830,16 @@ FOR j := 0 to UpperBound ENDFOR dst := bInvalid - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - +
+ + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. [strcmp_note] @@ -145890,17 +166955,16 @@ ENDFOR // output dst := (IntRes2 != 0) - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - +
+ + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. [strcmp_note] @@ -145915,17 +166979,16 @@ FOR i := 0 to UpperBound ENDFOR dst := aInvalid - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - +
+ + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns bit 0 of the resulting bit mask. [strcmp_note] @@ -146041,17 +167104,16 @@ ENDFOR // output dst := IntRes2[0] - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - +
+ + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. [strcmp_note] @@ -146167,18 +167229,18 @@ ENDFOR // output dst := (IntRes2 == 0) AND bInvalid - -
nmmintrin.h
-
- + SSE4.2 +
nmmintrin.h
String Compare - - - - - - +
+ + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated mask in "dst". [strcmp_note] @@ -146302,19 +167364,18 @@ ELSE // bit mask dst[127:UpperBound+1] := 0 FI - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - - - +
+ + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated index in "dst". [strcmp_note] @@ -146440,19 +167501,18 @@ ELSE // least significant bit OD FI - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - - - +
+ + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. [strcmp_note] @@ -146460,19 +167520,18 @@ size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 dst := (lb <= UpperBound) - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - - - +
+ + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. [strcmp_note] @@ -146584,19 +167643,18 @@ ENDFOR // output dst := (IntRes2 != 0) - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - - - +
+ + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. [strcmp_note] @@ -146604,19 +167662,18 @@ size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters UpperBound := (128 / size) - 1 dst := (la <= UpperBound) - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - - - +
+ + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns bit 0 of the resulting bit mask. [strcmp_note] @@ -146728,19 +167785,18 @@ ENDFOR // output dst := IntRes2[0] - -
nmmintrin.h
-
- - Flag + SSE4.2 +
nmmintrin.h
String Compare - - - - - - +
+ + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. [strcmp_note] @@ -146852,16 +167908,15 @@ ENDFOR // output dst := (IntRes2 == 0) AND (lb > UpperBound) - -
nmmintrin.h
-
- - Integer + SSE4.2 - Compare - - - +
nmmintrin.h
+ String Compare +
+ + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". FOR j := 0 to 1 @@ -146869,16 +167924,15 @@ FOR j := 0 to 1 dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 ENDFOR - -
nmmintrin.h
-
- - Integer + SSE4.2 - Cryptography - - - +
nmmintrin.h
+ Compare +
+ + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst". tmp1[7:0] := v[0:7] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection @@ -146888,16 +167942,15 @@ tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 dst[31:0] := tmp6[0:31] // bit reflection - -
nmmintrin.h
-
- - Integer + SSE4.2 +
nmmintrin.h
Cryptography - - - +
+ + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst". tmp1[15:0] := v[0:15] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection @@ -146907,16 +167960,15 @@ tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 dst[31:0] := tmp6[0:31] // bit reflection - -
nmmintrin.h
-
- - Integer + SSE4.2 +
nmmintrin.h
Cryptography - - - +
+ + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst". tmp1[31:0] := v[0:31] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection @@ -146926,16 +167978,15 @@ tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 dst[31:0] := tmp6[0:31] // bit reflection - -
nmmintrin.h
-
- - Integer + SSE4.2 +
nmmintrin.h
Cryptography - - - +
+ + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst". tmp1[63:0] := v[0:63] // bit reflection tmp2[31:0] := crc[0:31] // bit reflection @@ -146945,15 +167996,16 @@ tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 dst[31:0] := tmp6[0:31] // bit reflection - + + SSE4.2
nmmintrin.h
-
- - Integer - SSSE3 - Special Math Functions - - + Cryptography + + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 7 @@ -146961,15 +168013,14 @@ FOR j := 0 to 7 dst[i+7:i] := ABS(Int(a[i+7:i])) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Special Math Functions - - +
+ + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 15 @@ -146977,15 +168028,14 @@ FOR j := 0 to 15 dst[i+7:i] := ABS(a[i+7:i]) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Special Math Functions - - +
+ + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 3 @@ -146993,15 +168043,14 @@ FOR j := 0 to 3 dst[i+15:i] := ABS(Int(a[i+15:i])) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Special Math Functions - - +
+ + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 7 @@ -147009,15 +168058,14 @@ FOR j := 0 to 7 dst[i+15:i] := ABS(a[i+15:i]) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Special Math Functions - - +
+ + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 1 @@ -147025,15 +168073,14 @@ FOR j := 0 to 1 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Special Math Functions - - +
+ + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". FOR j := 0 to 3 @@ -147041,16 +168088,15 @@ FOR j := 0 to 3 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 - Swizzle - - - +
tmmintrin.h
+ Special Math Functions +
+ + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". FOR j := 0 to 15 @@ -147063,16 +168109,15 @@ FOR j := 0 to 15 FI ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Swizzle - - - +
+ + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". FOR j := 0 to 7 @@ -147085,48 +168130,45 @@ FOR j := 0 to 7 FI ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 - Miscellaneous - - - - +
tmmintrin.h
+ Swizzle +
+ + + + + Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) dst[127:0] := tmp[127:0] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Miscellaneous - - - - +
+ + + + + Concatenate 8-byte blocks in "a" and "b" into a 16-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) dst[63:0] := tmp[63:0] - -
tmmintrin.h
-
- - Integer + SSSE3 - Arithmetic - - - +
tmmintrin.h
+ Miscellaneous +
+ + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". dst[15:0] := a[31:16] + a[15:0] @@ -147138,16 +168180,15 @@ dst[95:80] := b[63:48] + b[47:32] dst[111:96] := b[95:80] + b[79:64] dst[127:112] := b[127:112] + b[111:96] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". dst[15:0] := Saturate16(a[31:16] + a[15:0]) @@ -147159,16 +168200,15 @@ dst[95:80] := Saturate16(b[63:48] + b[47:32]) dst[111:96] := Saturate16(b[95:80] + b[79:64]) dst[127:112] := Saturate16(b[127:112] + b[111:96]) - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". dst[31:0] := a[63:32] + a[31:0] @@ -147176,16 +168216,15 @@ dst[63:32] := a[127:96] + a[95:64] dst[95:64] := b[63:32] + b[31:0] dst[127:96] := b[127:96] + b[95:64] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". dst[15:0] := a[31:16] + a[15:0] @@ -147193,31 +168232,29 @@ dst[31:16] := a[63:48] + a[47:32] dst[47:32] := b[31:16] + b[15:0] dst[63:48] := b[63:48] + b[47:32] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". dst[31:0] := a[63:32] + a[31:0] dst[63:32] := b[63:32] + b[31:0] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". dst[15:0] := Saturate16(a[31:16] + a[15:0]) @@ -147225,16 +168262,15 @@ dst[31:16] := Saturate16(a[63:48] + a[47:32]) dst[47:32] := Saturate16(b[31:16] + b[15:0]) dst[63:48] := Saturate16(b[63:48] + b[47:32]) - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". dst[15:0] := a[15:0] - a[31:16] @@ -147246,16 +168282,15 @@ dst[95:80] := b[47:32] - b[63:48] dst[111:96] := b[79:64] - b[95:80] dst[127:112] := b[111:96] - b[127:112] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". dst[15:0] := Saturate16(a[15:0] - a[31:16]) @@ -147267,16 +168302,15 @@ dst[95:80] := Saturate16(b[47:32] - b[63:48]) dst[111:96] := Saturate16(b[79:64] - b[95:80]) dst[127:112] := Saturate16(b[111:96] - b[127:112]) - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". dst[31:0] := a[31:0] - a[63:32] @@ -147284,16 +168318,15 @@ dst[63:32] := a[95:64] - a[127:96] dst[95:64] := b[31:0] - b[63:32] dst[127:96] := b[95:64] - b[127:96] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". dst[15:0] := a[15:0] - a[31:16] @@ -147301,31 +168334,29 @@ dst[31:16] := a[47:32] - a[63:48] dst[47:32] := b[15:0] - b[31:16] dst[63:48] := b[47:32] - b[63:48] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". dst[31:0] := a[31:0] - a[63:32] dst[63:32] := b[31:0] - b[63:32] - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". dst[15:0] := Saturate16(a[15:0] - a[31:16]) @@ -147333,16 +168364,15 @@ dst[31:16] := Saturate16(a[47:32] - a[63:48]) dst[47:32] := Saturate16(b[15:0] - b[31:16]) dst[63:48] := Saturate16(b[47:32] - b[63:48]) - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". FOR j := 0 to 7 @@ -147350,16 +168380,15 @@ FOR j := 0 to 7 dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". FOR j := 0 to 3 @@ -147367,16 +168396,15 @@ FOR j := 0 to 3 dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". FOR j := 0 to 7 @@ -147385,16 +168413,15 @@ FOR j := 0 to 7 dst[i+15:i] := tmp[16:1] ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". FOR j := 0 to 3 @@ -147403,16 +168430,15 @@ FOR j := 0 to 3 dst[i+15:i] := tmp[16:1] ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. FOR j := 0 to 15 @@ -147426,16 +168452,15 @@ FOR j := 0 to 15 FI ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. FOR j := 0 to 7 @@ -147449,16 +168474,15 @@ FOR j := 0 to 7 FI ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. FOR j := 0 to 3 @@ -147472,16 +168496,15 @@ FOR j := 0 to 3 FI ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. FOR j := 0 to 7 @@ -147495,16 +168518,15 @@ FOR j := 0 to 7 FI ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. FOR j := 0 to 3 @@ -147518,16 +168540,15 @@ FOR j := 0 to 3 FI ENDFOR - -
tmmintrin.h
-
- - Integer + SSSE3 +
tmmintrin.h
Arithmetic - - - +
+ + + + Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. FOR j := 0 to 1 @@ -147541,44 +168562,85 @@ FOR j := 0 to 1 FI ENDFOR - + + SSSE3
tmmintrin.h
-
- - TSC - General Support - + Arithmetic + + + + + Copy the current 64-bit value of the processor's time-stamp counter into "dst". dst[63:0] := TimeStampCounter + TSC
immintrin.h
-
- - TSXLDTRK - Miscellaneous + General Support + + + + Mark the start of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a transactional region, subsequent loads are not added to the read set of the transaction. If this is used inside a suspend load address tracking region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. -
immintrin.h
-
- TSXLDTRK +
immintrin.h
Miscellaneous +
+ Mark the end of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a suspend load address tracking region it will end the suspend region and all following load addresses will be added to the transaction read set. If this is used inside an active transaction but not in a suspend region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. + TSXLDTRK
immintrin.h
-
- - Integer - AVX512VL - VAES - Cryptography - - - + Miscellaneous + + + + + + + Clear the user interrupt flag (UIF). + + UINTR +
immintrin.h
+ General Support +
+ + + + Send user interprocessor interrupts specified in unsigned 64-bit integer "__a". + + UINTR +
immintrin.h
+ General Support +
+ + + + Sets the user interrupt flag (UIF). + + UINTR +
immintrin.h
+ General Support +
+ + + + Store the current user interrupt flag (UIF) in unsigned 8-bit integer "dst". + + UINTR +
immintrin.h
+ General Support +
+ + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst"." FOR j := 0 to 1 i := j*128 @@ -147588,17 +168650,16 @@ ENDFOR ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + VAES + AVX512VL +
immintrin.h
Cryptography - - - +
+ + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst"." FOR j := 0 to 1 i := j*128 @@ -147609,17 +168670,16 @@ dst[MAX:256] := 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + VAES + AVX512VL +
immintrin.h
Cryptography - - - +
+ + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". FOR j := 0 to 1 i := j*128 @@ -147629,17 +168689,16 @@ dst[MAX:256] := 0 ENDFOR dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + VAES + AVX512VL +
immintrin.h
Cryptography - - - +
+ + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". FOR j := 0 to 1 i := j*128 @@ -147650,17 +168709,19 @@ dst[MAX:256] := 0 ENDFOR dst[MAX:256] := 0 - + + VAES + AVX512VL
immintrin.h
-
- - Integer - VPCLMULQDQ - Application-Targeted - - - - + Cryptography + + + + + + + + Carry-less multiplication of one quadword of 'b' by one quadword of 'c', stores the 128-bit result in 'dst'. The immediate 'Imm8' is @@ -147685,7 +168746,7 @@ DEFINE PCLMUL128(X,Y) { DEST[127] := 0 RETURN DEST // 128b vector } -FOR i := 0 to 3 +FOR i := 0 to 1 IF Imm8[0] == 0 TEMP1 := b.m128[i].qword[0] ELSE @@ -147698,20 +168759,21 @@ FOR i := 0 to 3 FI dst.m128[i] := PCLMUL128(TEMP1, TEMP2) ENDFOR -dst[MAX:512] := 0 +dst[MAX:256] := 0 - -
immintrin.h
-
- - Integer - AVX512VL + VPCLMULQDQ + AVX512VL +
immintrin.h
Application-Targeted - - - - +
+ + + + + + + Carry-less multiplication of one quadword of 'b' by one quadword of 'c', stores the 128-bit result in 'dst'. The immediate 'Imm8' is @@ -147736,7 +168798,7 @@ DEFINE PCLMUL128(X,Y) { DEST[127] := 0 RETURN DEST // 128b vector } -FOR i := 0 to 1 +FOR i := 0 to 3 IF Imm8[0] == 0 TEMP1 := b.m128[i].qword[0] ELSE @@ -147749,36 +168811,36 @@ FOR i := 0 to 1 FI dst.m128[i] := PCLMUL128(TEMP1, TEMP2) ENDFOR -dst[MAX:256] := 0 +dst[MAX:512] := 0 - + + VPCLMULQDQ
immintrin.h
-
- - Flag + Application-Targeted + + + + + + + + Directs the processor to enter an implementation-dependent optimized state until the TSC reaches or exceeds the value specified in "counter". Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system timelimit, the instructions sets RFLAGS.CF; otherwise, that flag is cleared. + WAITPKG - Miscellaneous - - - - Directs the processor to enter an implementation-dependent optimized state until the TSC reaches or exceeds the value specified in "counter". Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). -
immintrin.h
-
- - Flag - WAITPKG Miscellaneous - - - - Directs the processor to enter an implementation-dependent optimized state while monitoring a range of addresses. The instruction wakes up when the TSC reaches or exceeds the value specified in "counter" (if the monitoring hardware did not trigger beforehand). Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). - -
immintrin.h
-
- + + + + + + Directs the processor to enter an implementation-dependent optimized state while monitoring a range of addresses. The instruction wakes up when the TSC reaches or exceeds the value specified in "counter" (if the monitoring hardware did not trigger beforehand). Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system timelimit, the instructions sets RFLAGS.CF; otherwise, that flag is cleared. + WAITPKG +
immintrin.h
Miscellaneous +
+ Sets up a linear address range to be @@ -147786,96 +168848,31 @@ dst[MAX:256] := 0 monitor. The address range should be a writeback memory caching type. The address is contained in "a". - + + WAITPKG
immintrin.h
-
- - WBNOINVD Miscellaneous + + + + Write back and do not flush internal caches. Initiate writing-back without flushing of external caches. + WBNOINVD
immintrin.h
-
- - XSAVE - OS-Targeted - - - Copy up to 64-bits from the value of the extended control register (XCR) specified by "a" into "dst". Currently only XFEATURE_ENABLED_MASK XCR is supported. - dst[63:0] := XCR[a] - - -
immintrin.h
-
- - XSAVE - OS-Targeted - - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. - st_mask := mem_addr.HEADER.XSTATE_BV[62:0] -FOR i := 0 to 62 - IF (rs_mask[i] AND XCR0[i]) - IF st_mask[i] - CASE (i) OF - 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] - 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] - DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] - ESAC - ELSE - // ProcessorExtendedState := Processor Supplied Values - CASE (i) OF - 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] - ESAC - FI - FI - i := i + 1 -ENDFOR - - -
immintrin.h
-
- - XSAVE - OS-Targeted - - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. - st_mask := mem_addr.HEADER.XSTATE_BV[62:0] -FOR i := 0 to 62 - IF (rs_mask[i] AND XCR0[i]) - IF st_mask[i] - CASE (i) OF - 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] - 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] - DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] - ESAC - ELSE - // ProcessorExtendedState := Processor Supplied Values - CASE (i) OF - 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] - ESAC - FI - FI - i := i + 1 -ENDFOR - - -
immintrin.h
-
- - XSAVE - OS-Targeted + Miscellaneous + + + + - - Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. mask[62:0] := save_mask[62:0] AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] @@ -147889,16 +168886,17 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE + XSAVEC +
immintrin.h
OS-Targeted +
+ - - Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. mask[62:0] := save_mask[62:0] AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] @@ -147912,16 +168910,18 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE - XSAVEOPT + XSAVEC +
immintrin.h
OS-Targeted +
+ + + - + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction. mask[62:0] := save_mask[62:0] AND XCR0[62:0] FOR i := 0 to 62 @@ -147937,16 +168937,16 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE XSAVEOPT +
immintrin.h
OS-Targeted +
+ - + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction. mask[62:0] := save_mask[62:0] AND XCR0[62:0] FOR i := 0 to 62 @@ -147962,30 +168962,19 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE - OS-Targeted - - - - Copy 64-bits from "val" to the extended control register (XCR) specified by "a". Currently only XFEATURE_ENABLED_MASK XCR is supported. - -XCR[a] := val[63:0] - - + XSAVEOPT
immintrin.h
-
- - XSAVE - XSAVEC OS-Targeted + + + + - - Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. mask[62:0] := save_mask[62:0] AND XCR0[62:0] FOR i := 0 to 62 IF mask[i] @@ -147999,16 +168988,16 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE XSS +
immintrin.h
OS-Targeted +
+ - + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. mask[62:0] := save_mask[62:0] AND XCR0[62:0] FOR i := 0 to 62 @@ -148023,65 +169012,90 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE - XSAVEC + XSS +
immintrin.h
OS-Targeted +
+ - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI FI i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE XSS +
immintrin.h
OS-Targeted +
+ - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI FI i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE XSS +
immintrin.h
+ OS-Targeted +
+ + + + + + Copy up to 64-bits from the value of the extended control register (XCR) specified by "a" into "dst". Currently only XFEATURE_ENABLED_MASK XCR is supported. + dst[63:0] := XCR[a] + + + XSAVE +
immintrin.h
OS-Targeted +
+ - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 IF (rs_mask[i] AND XCR0[i]) @@ -148101,17 +169115,16 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - -
immintrin.h
-
- + XSAVE - XSS +
immintrin.h
OS-Targeted +
+ - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. st_mask := mem_addr.HEADER.XSTATE_BV[62:0] FOR i := 0 to 62 IF (rs_mask[i] AND XCR0[i]) @@ -148131,7 +169144,70 @@ FOR i := 0 to 62 i := i + 1 ENDFOR - + + XSAVE
immintrin.h
-
+ OS-Targeted + + + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Copy 64-bits from "val" to the extended control register (XCR) specified by "a". Currently only XFEATURE_ENABLED_MASK XCR is supported. + +XCR[a] := val[63:0] + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + \ No newline at end of file