Permalink
Browse files

Added Read4_Write4_SSE and sse.h

  • Loading branch information...
pzemtsov committed Jun 4, 2014
1 parent e38a2ae commit 8ab2ef65d84929bf0f7b7570018870995f939b8e
Showing with 105 additions and 0 deletions.
  1. +33 −0 e1-new.cpp
  2. +72 −0 sse.h
View
@@ -2,6 +2,7 @@
Revision 2: Added Write8
Revision 3: Added Read4_Write4
Revision 4: Unrolled Read4_Write4
Revision 5: Added Read4_Write4_SSE
*/
#include <cassert>
@@ -13,6 +14,7 @@
#include "timer.h"
#include "mymacros.h"
#include "sse.h"
typedef unsigned char byte;
@@ -204,6 +206,36 @@ class Read4_Write4_Unroll : public Demux
}
};
class Read4_Write4_SSE : public Demux
{
public:
void demux (const byte * src, size_t src_length, byte ** dst) const
{
assert (src_length == NUM_TIMESLOTS * DST_SIZE);
assert (DST_SIZE % 4 == 0);
assert (NUM_TIMESLOTS % 4 == 0);
for (size_t dst_num = 0; dst_num < NUM_TIMESLOTS; dst_num += 4) {
byte * d0 = dst [dst_num + 0];
byte * d1 = dst [dst_num + 1];
byte * d2 = dst [dst_num + 2];
byte * d3 = dst [dst_num + 3];
for (size_t dst_pos = 0; dst_pos < DST_SIZE; dst_pos += 4) {
uint32_t w0 = * (uint32_t*) &src [(dst_pos + 0) * NUM_TIMESLOTS + dst_num];
uint32_t w1 = * (uint32_t*) &src [(dst_pos + 1) * NUM_TIMESLOTS + dst_num];
uint32_t w2 = * (uint32_t*) &src [(dst_pos + 2) * NUM_TIMESLOTS + dst_num];
uint32_t w3 = * (uint32_t*) &src [(dst_pos + 3) * NUM_TIMESLOTS + dst_num];
__m128i m = _mm_setr_epi32 (w0, w1, w2, w3);
m = transpose_4x4 (m);
* (uint32_t*) &d0 [dst_pos] = (uint32_t) _mm_extract_epi32 (m, 0);
* (uint32_t*) &d1 [dst_pos] = (uint32_t) _mm_extract_epi32 (m, 1);
* (uint32_t*) &d2 [dst_pos] = (uint32_t) _mm_extract_epi32 (m, 2);
* (uint32_t*) &d3 [dst_pos] = (uint32_t) _mm_extract_epi32 (m, 3);
}
}
}
};
byte * generate ()
{
byte * buf = new byte [SRC_SIZE];
@@ -274,6 +306,7 @@ int main (void)
measure (Write8 ());
measure (Read4_Write4 ());
measure (Read4_Write4_Unroll ());
measure (Read4_Write4_SSE ());
return 0;
}
View
72 sse.h
@@ -0,0 +1,72 @@
#include <xmmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>
/** Many functions here are defined as macros. The reason is that the SSE/AVX shuffle/permute instructions
* require compile-time constant arguments, and there is no way to express such a requirement in C
* (or, rather, I don't know of such a way; perhaps something is possible with templates).
* Even if a function is inline and is called with constant arguments, the compiler still complains when this
* function passes its parameters to intrinsics. Macros work around this; however, we pay for it with
* a lack of type checking.
*/
/** Combine together two fields of 4 bits each, in lower to high order.
 * Used in permute2f128
 * @param n0 constant integer value of size 4 bits (not checked)
 * @param n1 constant integer value of size 4 bits (not checked)
 * @return combined 8-bit value where lower 4 bits contain n0 and higher 4 bits contain n1 (format used by permute2f128/VPERM2F128)
 * Each argument is parenthesized in the expansion, so expressions such as (A | B) are combined as whole values.
 */
#define combine_2_4bits(n0, n1) ((n0) + ((n1) << 4))
/** Combine together four fields of 2 bits each, in lower to high order.
 * Used in 128 and 256 bits shuffles and permutations
 * @param n0 constant integer value of size 2 bits (not checked)
 * @param n1 constant integer value of size 2 bits (not checked)
 * @param n2 constant integer value of size 2 bits (not checked)
 * @param n3 constant integer value of size 2 bits (not checked) (guys, was it really so necessary to write these comments?)
 * @return combined 8-bit value where lower 2 bits contain n0 and high 2 bits contain n3 (format used by _mm_shuffle_ps/SHUFPS)
 * Each argument is parenthesized in the expansion, so expressions such as (A | B) are combined as whole values.
 */
#define combine_4_2bits(n0, n1, n2, n3) ((n0) + ((n1) << 2) + ((n2) << 4) + ((n3) << 6))
// ------ General shuffles and permutations
/** Shuffles two 128-bit registers according to four 2-bit constants defining positions.
 * @param x A0 A1 A2 A3 (each element a 32-bit float)
 * @param y C0 C1 C2 C3 (each element a 32-bit float)
 * @param n0 compile-time constant, 0..3: index into x for result position 0
 * @param n1 compile-time constant, 0..3: index into x for result position 1
 * @param n2 compile-time constant, 0..3: index into y for result position 2
 * @param n3 compile-time constant, 0..3: index into y for result position 3
 * @return A[n0] A[n1] C[n2] C[n3]
 * Note that positions 0, 1 are only filled with data from x, positions 2, 3 only with data from y.
 * Components of a single vector can be shuffled in any order by using this function with x and itself
 * (see _mm_shuffle_ps intrinsic and SHUFPS instruction).
 * The arguments are parenthesized before being forwarded, so constant expressions are passed on as whole values.
 */
#define _128_shuffle(x, y, n0, n1, n2, n3) _mm_shuffle_ps((x), (y), combine_4_2bits ((n0), (n1), (n2), (n3)))
/** Shuffles two 128-bit integer registers according to four 2-bit constants defining positions.
 * The operands are reinterpreted as float vectors, shuffled with _128_shuffle, and reinterpreted back;
 * the casts only change the vector type, not the bits.
 * @param x A0 A1 A2 A3 (each element a 32-bit integer)
 * @param y C0 C1 C2 C3 (each element a 32-bit integer)
 * @param n0 compile-time constant, 0..3: index into x for result position 0
 * @param n1 compile-time constant, 0..3: index into x for result position 1
 * @param n2 compile-time constant, 0..3: index into y for result position 2
 * @param n3 compile-time constant, 0..3: index into y for result position 3
 * @return A[n0] A[n1] C[n2] C[n3]
 * Note that positions 0, 1 are only filled with data from x, positions 2, 3 only with data from y.
 * Components of a single vector can be shuffled in any order by using this function with x and itself
 * (see _mm_shuffle_ps intrinsic and SHUFPS instruction).
 * The arguments are parenthesized before being forwarded, so constant expressions are passed on as whole values.
 */
#define _128i_shuffle(x, y, n0, n1, n2, n3) _mm_castps_si128(_128_shuffle(_mm_castsi128_ps(x), _mm_castsi128_ps(y), (n0), (n1), (n2), (n3)))
// ------ More specific permutations
/** Transposes a 4x4 matrix of bytes held in one 128-bit register.
 * The register is viewed as 16 bytes in row-major order:
 *     m00 m01 m02 m03
 *     m10 m11 m12 m13
 *     m20 m21 m22 m23
 *     m30 m31 m32 m33
 * @param m the source matrix, stored as rows
 * @return the transposed matrix, stored as rows:
 *     m00 m10 m20 m30
 *     m01 m11 m21 m31
 *     m02 m12 m22 m32
 *     m03 m13 m23 m33
 * Implemented as a single byte shuffle (_mm_shuffle_epi8 / PSHUFB).
 */
inline __m128i transpose_4x4 (__m128i m)
{
    // Result byte k is taken from source byte 4 * (k % 4) + k / 4,
    // i.e. the source is read column by column.
    const __m128i column_order = _mm_setr_epi8 (0, 4,  8, 12,
                                                1, 5,  9, 13,
                                                2, 6, 10, 14,
                                                3, 7, 11, 15);
    return _mm_shuffle_epi8 (m, column_order);
}

0 comments on commit 8ab2ef6

Please sign in to comment.