/*
* Copyright (C) 2009-2012 Simon A. Berger
*
* This file is part of papara.
*
* papara is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* papara is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with papara. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef VEC_UNIT_H
#define VEC_UNIT_H
#include <iostream>
#include <stdexcept>
#include <cassert>
#include <cstdio>
#include <stdint.h>
// the convenient x86intrin.h is not available on ancient gcc/msvc, so pull in the best manually.
// SSE3 is the absolute baseline. SSSE3 (16/32bit abs) and SSE4.1 (32bit min/max) are nice to have,
// and AVX is a completely different beast...
#ifdef __MMX__
#include <mmintrin.h>
#endif
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __SSE4A__
#include <ammintrin.h>
#endif
#if defined (__SSE4_2__) || defined (__SSE4_1__)
#include <smmintrin.h>
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
#ifndef _MSC_VER
#include <x86intrin.h>
#endif
#ifdef __AVX__
#define HAVE_AVX
#endif
#ifdef min
#error min is defined as a macro. This is evil. Please #define NOMINMAX before including any windows headers.
#endif
#ifdef max
#error max is defined as a macro. This is evil. Please #define NOMINMAX before including any windows headers.
#endif
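// Illustrative sketch of the fix on Windows (include order only, not code used here):
//
//   #define NOMINMAX
//   #include <windows.h>   // min/max are now plain functions, not macros
//   #include "vec_unit.h"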
const size_t required_alignment = 16;
template<class T, size_t W>
struct vector_unit {
};
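// The empty primary template is just a hook: all functionality lives in the
// explicit specializations below, which share a common static interface
// (set1/load/store/add/min/max/...), so generic code picks a unit by element
// type and vector width. A minimal usage sketch follows; the function name and
// the assumption that n is a multiple of W and the pointers are 16-byte
// aligned are illustrative, not papara API:
template<class T, size_t W>
inline void vec_unit_add_arrays_sketch( const T *a, const T *b, T *out, size_t n ) {
    typedef vector_unit<T, W> vu;
    for( size_t i = 0; i < n; i += W ) {
        const typename vu::vec_t va = vu::load( a + i );
        const typename vu::vec_t vb = vu::load( b + i );
        vu::store( vu::add( va, vb ), out + i );
    }
}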
// vector unit specialization: SSE 8x16bit integer
template<>
struct vector_unit<short, 8> {
const static bool do_checks = false;
typedef __m128i vec_t;
typedef short T;
const static T POS_MAX_VALUE = 0x7fff;
const static T LARGE_VALUE = 32000;
const static T SMALL_VALUE = -32000;
const static T BIAS = 0;
const static size_t W = 8;
static inline vec_t setzero() {
return set1(0);
}
static inline vec_t set1( T val ) {
return _mm_set1_epi16( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm_store_si128( (vec_t*)addr, v );
}
static inline const vec_t load( const T* addr ) {
return _mm_load_si128( (vec_t*)addr );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return _mm_and_si128( a, b );
}
static inline const vec_t bit_or( const vec_t &a, const vec_t &b ) {
return _mm_or_si128( a, b );
}
static inline const vec_t bit_andnot( const vec_t &a, const vec_t &b ) {
return _mm_andnot_si128( a, b );
}
static inline const vec_t bit_invert( const vec_t &a ) {
return _mm_xor_si128( a, set1(T(0xffff)) );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return _mm_add_epi16( a, b );
}
static inline const vec_t adds( const vec_t &a, const vec_t &b ) {
return _mm_adds_epi16( a, b );
}
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
return _mm_sub_epi16( a, b );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return _mm_cmpeq_epi16( a, setzero() );
}
static inline const vec_t cmp_eq( const vec_t &a, const vec_t &b ) {
return _mm_cmpeq_epi16( a, b );
}
static inline const vec_t cmp_lt( const vec_t &a, const vec_t &b ) {
return _mm_cmplt_epi16( a, b );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
return _mm_min_epi16( a, b );
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
return _mm_max_epi16( a, b );
}
static inline const vec_t abs_diff( const vec_t &a, const vec_t &b ) {
// I don't really like this function, as ideally it would just be abs(sub(a,b)),
// but there doesn't seem to be a fast way to implement abs before SSSE3.
// The max(sub(a,b),sub(b,a)) work-around seems to be the next-best thing in this special case.
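// e.g. with lanes a = 3, b = 7: sub(a,b) = -4, sub(b,a) = 4, and
// max(-4, 4) = 4 = |a-b|; this holds whenever a-b does not overflow.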
#ifdef __SSSE3__
return _mm_abs_epi16(sub(a,b));
#else
// FIXME: is there a faster method for boring old CPUs?
return max( sub(a,b), sub(b,a) );
#endif
}
static inline void assert_alignment( T * p ) {
assert( size_t(p) % required_alignment == 0 );
}
};
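// Illustrative usage sketch for the 8x16bit unit (the function and its local
// buffer are assumptions for this example, not papara API). load/store need
// 16-byte aligned addresses; the union with __m128i forces that alignment.
static inline void vec_unit_short8_example() {
    typedef vector_unit<short, 8> vu;
    union { __m128i align; short s[8]; } u;
    for( size_t i = 0; i < vu::W; ++i ) {
        u.s[i] = short(i);
    }
    vu::assert_alignment( u.s );
    vu::vec_t v = vu::load( u.s );
    v = vu::adds( v, vu::set1( 10 ) ); // saturating add: lanes become 10..17
    vu::store( v, u.s );
}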
#ifndef __AVX__
// vector unit specialization: 16x16bit integer emulated with two SSE registers
// (used in place of the AVX version below when AVX is unavailable)
template<>
struct vector_unit<short, 16> {
const static bool do_checks = false;
struct vec_t {
__m128i l;
__m128i h;
vec_t( const __m128i &ll, const __m128i &hh ) : l(ll), h(hh) {}
};
typedef short T;
const static T POS_MAX_VALUE = 0x7fff;
const static T LARGE_VALUE = 32000;
const static T SMALL_VALUE = -32000;
const static T BIAS = 0;
const static size_t W = 16;
static inline vec_t setzero() {
return set1(0);
}
static inline vec_t set1( T val ) {
return vec_t(_mm_set1_epi16( val ), _mm_set1_epi16( val ));
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm_store_si128( (__m128i*)addr, v.l );
_mm_store_si128( (__m128i*)(addr + 8), v.h );
}
static inline const vec_t load( const T* addr ) {
return vec_t( _mm_load_si128( (__m128i*)addr ), _mm_load_si128( (__m128i*)(addr + 8 )) );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_and_si128( a.l, b.l ), _mm_and_si128( a.h, b.h ));
}
static inline const vec_t bit_or( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_or_si128( a.l, b.l ), _mm_or_si128( a.h, b.h ));
}
static inline const vec_t bit_andnot( const vec_t &a, const vec_t &b ) {
return vec_t( _mm_andnot_si128( a.l, b.l ), _mm_andnot_si128( a.h, b.h ) );
}
static inline const vec_t bit_invert( const vec_t &a ) {
return vec_t( _mm_xor_si128( a.l, _mm_set1_epi16(T(0xffff)) ), _mm_xor_si128( a.h, _mm_set1_epi16(T(0xffff)) ) );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_add_epi16( a.l, b.l ), _mm_add_epi16( a.h, b.h ));
}
static inline const vec_t adds( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_adds_epi16( a.l, b.l ), _mm_adds_epi16( a.h, b.h ));
}
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_sub_epi16( a.l, b.l ),_mm_sub_epi16( a.h, b.h ));
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return vec_t(_mm_cmpeq_epi16( a.l, _mm_setzero_si128() ), _mm_cmpeq_epi16( a.h, _mm_setzero_si128() ) );
}
static inline const vec_t cmp_eq( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_cmpeq_epi16( a.l, b.l ), _mm_cmpeq_epi16( a.h, b.h ) );
}
static inline const vec_t cmp_lt( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_cmplt_epi16( a.l, b.l ), _mm_cmplt_epi16( a.h, b.h ) );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_min_epi16( a.l, b.l ),_mm_min_epi16( a.h, b.h ));
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
return vec_t(_mm_max_epi16( a.l, b.l ), _mm_max_epi16( a.h, b.h ));
}
static inline const vec_t abs_diff( const vec_t &a, const vec_t &b ) {
// same work-around as in the 8-wide unit: abs(sub(a,b)) needs SSSE3,
// so fall back to max(sub(a,b),sub(b,a)) otherwise.
#ifdef __SSSE3__
const vec_t &tmp = sub(a,b);
return vec_t(_mm_abs_epi16(tmp.l), _mm_abs_epi16(tmp.h));
#else
// FIXME: is there a faster method for boring old CPUs?
return max( sub(a,b), sub(b,a) );
#endif
}
static inline void assert_alignment( T * p ) {
assert( size_t(p) % required_alignment == 0 );
}
};
#endif
template<>
struct vector_unit<int, 4> {
const static bool do_checks = false;
typedef __m128i vec_t;
typedef int T;
const static T POS_MAX_VALUE = 0x7fffffff;
const static T LARGE_VALUE = 2100000000;
const static T SMALL_VALUE = -2100000000;
const static T BIAS = 0;
const static size_t W = 4;
static inline vec_t setzero() {
return set1(0);
}
static inline vec_t set1( T val ) {
return _mm_set1_epi32( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm_store_si128( (vec_t*)addr, v );
}
static inline const vec_t load( const T* addr ) {
return _mm_load_si128( (vec_t*)addr );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return _mm_and_si128( a, b );
}
static inline const vec_t bit_andnot( const vec_t &a, const vec_t &b ) {
return _mm_andnot_si128( a, b );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return _mm_add_epi32( a, b );
}
static inline const vec_t adds( const vec_t &a, const vec_t &b ) {
// there is no saturating add for 32bit ints in SSE2, so fall back to a plain
// (wrapping) add and treat saturation as a best-effort hint.
return _mm_add_epi32( a, b );
}
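// If exact saturation were ever needed, a branchless SSE2 sketch could look
// like this (an illustration, not used anywhere in papara): signed overflow
// occurred in a lane iff a and b have the same sign but the sum's sign differs.
static inline const vec_t adds_exact_sketch( const vec_t &a, const vec_t &b ) {
    const vec_t sum = _mm_add_epi32( a, b );
    // sign bit of (~(a^b) & (a^sum)) flags overflowing lanes
    const vec_t ovf = _mm_andnot_si128( _mm_xor_si128( a, b ), _mm_xor_si128( a, sum ) );
    const vec_t ovf_mask = _mm_srai_epi32( ovf, 31 ); // all-ones where overflow
    // saturated value: INT_MIN if a was negative, INT_MAX otherwise
    const vec_t sat = _mm_xor_si128( _mm_srai_epi32( a, 31 ), set1( POS_MAX_VALUE ) );
    return _mm_or_si128( _mm_and_si128( ovf_mask, sat ), _mm_andnot_si128( ovf_mask, sum ) );
}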
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
return _mm_sub_epi32( a, b );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return _mm_cmpeq_epi32( a, setzero() );
}
static inline const vec_t cmp_eq( const vec_t &a, const vec_t &b ) {
return _mm_cmpeq_epi32( a, b );
}
static inline const vec_t cmp_lt( const vec_t &a, const vec_t &b ) {
return _mm_cmplt_epi32( a, b );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
// _mm_min_epi32 requires SSE4.1, oddly enough.
#ifdef __SSE4_1__
return _mm_min_epi32( a, b );
#else
// fallback: blend a and b through the compare mask. NOTE: untested code path.
const vec_t ma = _mm_cmplt_epi32( a, b );
return _mm_or_si128( _mm_and_si128( ma, a ), _mm_andnot_si128( ma, b ) );
#endif
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
#ifdef __SSE4_1__
return _mm_max_epi32( a, b );
#else
// fallback: blend a and b through the compare mask. NOTE: untested code path.
const vec_t ma = _mm_cmpgt_epi32( a, b );
return _mm_or_si128( _mm_and_si128( ma, a ), _mm_andnot_si128( ma, b ) );
#endif
}
static inline void println( const vec_t & v ) {
T tmp[W];
_mm_storeu_si128( (vec_t*)tmp, v );
printf( "%d %d %d %d\n", tmp[0], tmp[1], tmp[2], tmp[3] );
}
static inline const vec_t abs_diff( const vec_t &a, const vec_t &b ) {
// same work-around as in the 16bit units: abs(sub(a,b)) needs SSSE3,
// so fall back to max(sub(a,b),sub(b,a)) otherwise.
#ifdef __SSSE3__
return _mm_abs_epi32(sub(a,b));
#else
// FIXME: is there a faster method for boring old CPUs?
return max( sub(a,b), sub(b,a) );
#endif
}
static inline void assert_alignment( T * p ) {
assert( size_t(p) % required_alignment == 0 );
}
};
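// Illustrative smoke test for the int32 unit (the function name and values are
// assumptions for this example), e.g. to exercise the pre-SSE4.1 min/max
// fallback paths:
static inline void vec_unit_int4_minmax_example() {
    typedef vector_unit<int, 4> vu;
    const vu::vec_t a = vu::set1( 3 );
    const vu::vec_t b = vu::set1( -7 );
    vu::println( vu::min( a, b ) ); // expected output: -7 -7 -7 -7
    vu::println( vu::max( a, b ) ); // expected output: 3 3 3 3
}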
#define VEC_UNIT_ENABLE_FLOAT
#ifdef VEC_UNIT_ENABLE_FLOAT
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wgnu"
#endif
template<>
struct vector_unit<float, 4> {
const static bool do_checks = false;
typedef __m128 vec_t;
typedef float T;
const static int SIGN_MASK_INT = 0x7FFFFFFF;
// const static T LARGE_VALUE;
// const static T SMALL_VALUE;
// const static T BIAS;
const static size_t W = 4;
static inline vec_t cast_from_int( const __m128i &iv ) {
return _mm_castsi128_ps( iv );
}
static inline vec_t setzero() {
return set1(0);
}
static inline vec_t set1( T val ) {
return _mm_set1_ps( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm_store_ps( (T*)addr, v );
}
static inline const vec_t load( const T* addr ) {
return _mm_load_ps( (T*)addr );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return _mm_and_ps( a, b );
}
static inline const vec_t bit_andnot( const vec_t &a, const vec_t &b ) {
return _mm_andnot_ps( a, b );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return _mm_add_ps( a, b );
}
static inline const vec_t mul( const vec_t &a, const vec_t &b ) {
return _mm_mul_ps( a, b );
}
static inline const vec_t adds( const vec_t &a, const vec_t &b ) {
// float add effectively saturates: overflow goes to +/-infinity.
return _mm_add_ps( a, b );
}
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
return _mm_sub_ps( a, b );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return _mm_cmpeq_ps( a, setzero() );
}
static inline const vec_t cmp_eq( const vec_t &a, const vec_t &b ) {
// TODO: think about what this really means. Maybe use something epsilon-based as a default for float?
return _mm_cmpeq_ps( a, b );
}
static inline const vec_t cmp_lt( const vec_t &a, const vec_t &b ) {
return _mm_cmplt_ps( a, b );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
return _mm_min_ps( a, b );
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
return _mm_max_ps( a, b );
}
static inline const vec_t abs_diff( const vec_t &a, const vec_t &b ) {
// for floats, abs just clears the IEEE sign bit, so abs(sub(a,b)) is a
// bitwise AND with 0x7FFFFFFF in every lane. Build the mask through the
// integer unit instead of type-punning through a pointer, which would
// break strict aliasing.
return bit_and( sub(a,b), cast_from_int( _mm_set1_epi32( SIGN_MASK_INT ) ) );
}
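// Example of the sign-mask trick used by abs_diff above: -3.5f has bit
// pattern 0xC0600000; ANDing with 0x7FFFFFFF clears the sign bit, giving
// 0x40600000, which is 3.5f.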
};
template<>
struct vector_unit<double, 2> {
const static bool do_checks = false;
typedef __m128d vec_t;
typedef double T;
// const static uint64_t SIGN_MASK_U64 = 0x7FFFFFFFFFFFFFFF;
// const static T LARGE_VALUE;
// const static T SMALL_VALUE;
// const static T BIAS;
const static size_t W = 2;
static inline vec_t cast_from_int( const __m128i &iv ) {
return _mm_castsi128_pd( iv );
}
static inline vec_t setzero() {
return set1(0);
}
static inline vec_t set1( T val ) {
return _mm_set1_pd( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm_store_pd( (T*)addr, v );
}
static inline const vec_t load( const T* addr ) {
return _mm_load_pd( (T*)addr );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return _mm_and_pd( a, b );
}
static inline const vec_t bit_or( const vec_t &a, const vec_t &b ) {
return _mm_or_pd( a, b );
}
static inline const vec_t bit_andnot( const vec_t &a, const vec_t &b ) {
return _mm_andnot_pd( a, b );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return _mm_add_pd( a, b );
}
static inline const vec_t mul( const vec_t &a, const vec_t &b ) {
return _mm_mul_pd( a, b );
}
static inline const vec_t adds( const vec_t &a, const vec_t &b ) {
// floating-point add effectively saturates: overflow goes to +/-infinity.
return _mm_add_pd( a, b );
}
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
return _mm_sub_pd( a, b );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return _mm_cmpeq_pd( a, setzero() );
}
static inline const vec_t cmp_eq( const vec_t &a, const vec_t &b ) {
// TODO: think about what this really means. Maybe use something epsilon-based as a default for float?
return _mm_cmpeq_pd( a, b );
}
static inline const vec_t cmp_lt( const vec_t &a, const vec_t &b ) {
return _mm_cmplt_pd( a, b );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
return _mm_min_pd( a, b );
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
return _mm_max_pd( a, b );
}
// deactivated for now, because the ULL suffix needed for the 64bit mask literal is not available pre-C++11
#if 0
static inline const vec_t abs_diff( const vec_t &a, const vec_t &b ) {
// same sign-mask trick as in the float unit, with a 64bit mask per lane
const uint64_t SIGN_MASK_U64x = 0x7fffffffffffffffULL;
const double *SIGN_MASK_PTR = (double*)&SIGN_MASK_U64x;
double SIGN_MASK = *SIGN_MASK_PTR;
return bit_and(sub(a,b), set1(SIGN_MASK) ); // TODO: could this cause any alignment problems?
}
#endif
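// A working alternative sketch (an illustration, not enabled in papara):
// build the sign mask through the integer unit instead of type-punning
// through a pointer, which also sidesteps the ULL-suffix issue above.
static inline const vec_t abs_diff_sketch( const vec_t &a, const vec_t &b ) {
    // per 64bit lane: high dword 0x7fffffff, low dword all-ones,
    // i.e. everything set except the IEEE sign bit
    const __m128i mask = _mm_set_epi32( 0x7fffffff, -1, 0x7fffffff, -1 );
    return bit_and( sub( a, b ), cast_from_int( mask ) );
}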
static inline const vec_t abs( const vec_t &a ) {
// branch-free abs without a sign mask: max(a, 0 - a)
return max( a, sub( setzero(), a ));
}
static inline void println( const vec_t & v ) {
double tmp[2];
_mm_storeu_pd( tmp, v );
printf( "%f %f\n", tmp[0], tmp[1] );
}
};
// const vector_unit<float,4>::T vector_unit<float,4>::LARGE_VALUE = 1e8;
// const vector_unit<float,4>::T vector_unit<float,4>::SMALL_VALUE = -1e8;
// const vector_unit<float,4>::T vector_unit<float,4>::BIAS = 0;
//
// const double vector_unit<double,2>::LARGE_VALUE = 1e8;
// const double vector_unit<double,2>::SMALL_VALUE = -1e8;
// const double vector_unit<double,2>::BIAS = 0;
#ifdef __clang__
#pragma clang diagnostic pop
#endif
#endif // VEC_UNIT_ENABLE_FLOAT
#ifdef HAVE_AVX
// AVX1 is of little use for integer code: 256bit integer ops only arrive with AVX2,
// so everything below has to be split into two 128bit halves.
// AVX 16x16bit vector unit
template<>
struct vector_unit<short, 16> {
const static bool do_checks = false;
typedef __m256i vec_t;
typedef short T;
const static T SMALL_VALUE = -32000;
const static T BIAS = 0;
const static size_t W = 16;
static inline vec_t setzero() {
return _mm256_setzero_si256();
}
static inline vec_t set1( T val ) {
return _mm256_set1_epi16( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm256_store_si256( (vec_t*)addr, v );
}
static inline const vec_t load( const T* addr ) {
return _mm256_load_si256( (vec_t*)addr );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
__m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
lowa = _mm_and_si128( lowa, lowb );
__m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
higha = _mm_and_si128( higha, highb );
return _mm256_insertf128_si256( _mm256_castsi128_si256(lowa), higha, 1 );
}
static inline const vec_t bit_andnot( const vec_t &a, const vec_t &b ) {
__m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
lowa = _mm_andnot_si128( lowa, lowb );
__m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
higha = _mm_andnot_si128( higha, highb );
return _mm256_insertf128_si256( _mm256_castsi128_si256(lowa), higha, 1 );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
const __m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
const __m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
return _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_add_epi16( lowa, lowb )), _mm_add_epi16( higha, highb ), 1 );
}
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
__m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
lowa = _mm_sub_epi16( lowa, lowb );
__m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
higha = _mm_sub_epi16( higha, highb );
return _mm256_insertf128_si256( _mm256_castsi128_si256(lowa), higha, 1 );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return cmp_eq( a, setzero() );
}
static inline const vec_t cmp_eq( const vec_t &a, const vec_t &b ) {
__m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
lowa = _mm_cmpeq_epi16( lowa, lowb );
__m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
higha = _mm_cmpeq_epi16( higha, highb );
return _mm256_insertf128_si256( _mm256_castsi128_si256(lowa), higha, 1 );
}
static inline const vec_t cmp_lt( const vec_t &a, const vec_t &b ) {
__m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
lowa = _mm_cmplt_epi16( lowa, lowb );
__m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
higha = _mm_cmplt_epi16( higha, highb );
return _mm256_insertf128_si256( _mm256_castsi128_si256(lowa), higha, 1 );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
__m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
lowa = _mm_min_epi16( lowa, lowb );
__m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
higha = _mm_min_epi16( higha, highb );
return _mm256_insertf128_si256( _mm256_castsi128_si256(lowa), higha, 1 );
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
__m128i lowa = _mm256_castsi256_si128(a);
const __m128i lowb = _mm256_castsi256_si128(b);
lowa = _mm_max_epi16( lowa, lowb );
__m128i higha = _mm256_extractf128_si256(a,1);
const __m128i highb = _mm256_extractf128_si256(b,1);
higha = _mm_max_epi16( higha, highb );
return _mm256_insertf128_si256( _mm256_castsi128_si256(lowa), higha, 1 );
}
};
template<>
struct vector_unit<double, 4> {
const static bool do_checks = false;
typedef __m256d vec_t;
typedef double T;
// const static uint64_t SIGN_MASK_U64 = 0x7FFFFFFFFFFFFFFF;
const static T LARGE_VALUE;// = 1e8;
const static T SMALL_VALUE; //= -1e8;
const static T BIAS;// = 0;
const static size_t W = 4;
static inline vec_t setzero() {
return _mm256_setzero_pd();
}
static inline vec_t set1( T val ) {
return _mm256_set1_pd( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm256_store_pd( (T*)addr, v );
}
static inline const vec_t load( const T* addr ) {
return _mm256_load_pd( (T*)addr );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return _mm256_and_pd( a, b );
}
static inline const vec_t bit_or( const vec_t &a, const vec_t &b ) {
return _mm256_or_pd( a, b );
}
static inline const vec_t bit_andnot( const vec_t &a, const vec_t &b ) {
return _mm256_andnot_pd( a, b );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return _mm256_add_pd( a, b );
}
static inline const vec_t mul( const vec_t &a, const vec_t &b ) {
return _mm256_mul_pd( a, b );
}
static inline const vec_t adds( const vec_t &a, const vec_t &b ) {
// floating-point add effectively saturates: overflow goes to +/-infinity.
return _mm256_add_pd( a, b );
}
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
return _mm256_sub_pd( a, b );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return _mm256_cmp_pd( a, setzero(), _CMP_EQ_OQ );
}
static inline const vec_t cmp_eq( const vec_t &a, const vec_t &b ) {
// TODO: think about what this really means. Maybe use something epsilon-based as a default for float?
return _mm256_cmp_pd( a, b, _CMP_EQ_OQ );
}
static inline const vec_t cmp_lt( const vec_t &a, const vec_t &b ) {
return _mm256_cmp_pd( a, b, _CMP_LT_OQ );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
return _mm256_min_pd( a, b );
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
return _mm256_max_pd( a, b );
}
static inline const vec_t abs_diff( const vec_t &a, const vec_t &b ) {
// same sign-mask trick as in the float unit: clear the IEEE sign bit of
// each 64bit lane. Build the mask through the integer unit instead of
// type-punning through a pointer, which would break strict aliasing.
return bit_and( sub(a,b), _mm256_castsi256_pd( _mm256_set1_epi64x( 0x7fffffffffffffffLL ) ) );
}
static inline const vec_t abs( const vec_t &a ) {
// branch-free abs without a sign mask: max(a, 0 - a)
return max( a, sub( setzero(), a ));
}
static inline void println( const vec_t & v ) {
double tmp[4];
_mm256_storeu_pd( tmp, v );
printf( "%f %f %f %f\n", tmp[0], tmp[1], tmp[2], tmp[3] );
}
};
const double vector_unit<double,4>::LARGE_VALUE = 1e8;
const double vector_unit<double,4>::SMALL_VALUE = -1e8;
const double vector_unit<double,4>::BIAS = 0;
#endif
// vector unit specialization: SSE 16x8bit integer
template<>
struct vector_unit<unsigned char, 16> {
const static bool do_checks = false;
typedef __m128i vec_t;
typedef unsigned char T;
const static size_t W = 16;
const static T SMALL_VALUE = 1;
const static T BIAS = 12;
static inline vec_t setzero() {
return set1(0);
}
static inline vec_t set1( T val ) {
return _mm_set1_epi8( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
_mm_store_si128( (__m128i*)addr, v );
}
static inline const vec_t load( const T* addr ) {
return _mm_load_si128( (const __m128i *) addr );
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return _mm_and_si128( a, b );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return _mm_add_epi8( a, b );
}
static inline const vec_t sub( const vec_t &a, const vec_t &b ) {
return _mm_sub_epi8( a, b );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return _mm_cmpeq_epi8( a, setzero() );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
return _mm_min_epu8( a, b );
}
static inline const vec_t max( const vec_t &a, const vec_t &b ) {
return _mm_max_epu8( a, b );
}
static inline void println( const vec_t &v, T *tmp ) {
store(v, tmp);
std::cout << "(";
for( size_t i = 0; i < W; i++ ) {
std::cout << int(tmp[i]) << ((i < W-1) ? "," : ")");
}
std::cout << std::endl;
}
};
template<>
struct vector_unit<short, 1> {
const static bool do_checks = false;
typedef __m128i vec_t;
typedef short T;
const static size_t W = 1;
static inline vec_t setzero() {
return set1(0);
}
static inline vec_t set1( T val ) {
return _mm_set1_epi16( val );
}
static inline void store( const vec_t &v, T *addr ) {
if( do_checks && addr == 0 ) {
throw std::runtime_error( "store: addr == 0" );
}
//std::cout << "store to: " << addr << "\n";
//_mm_store_pd( (float*)addr, (__m128)v );
*addr = _mm_extract_epi16( v, 0 );
}
static inline const vec_t load( const T* addr ) {
return set1(*addr);
}
static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
return _mm_and_si128( a, b );
}
static inline const vec_t add( const vec_t &a, const vec_t &b ) {
return _mm_add_epi16( a, b );
}
static inline const vec_t cmp_zero( const vec_t &a ) {
return _mm_cmpeq_epi16( a, setzero() );
}
static inline const vec_t min( const vec_t &a, const vec_t &b ) {
return _mm_min_epi16( a, b );
}
};
// disabled alternative: plain scalar implementation of vector_unit<short, 1>
// template<>
// struct vector_unit<short, 1> {
//
// const static bool do_checks = false;
//
// typedef short vec_t;
// typedef short T;
//
// const static size_t W = 1;
//
// static inline vec_t setzero() {
// return 0;
// }
//
// static inline vec_t set1( T val ) {
// return val;
// }
//
// static inline void store( const vec_t &v, T *addr ) {
//
// if( do_checks && addr == 0 ) {
// throw std::runtime_error( "store: addr == 0" );
// }
//
// *addr = v;
//
// }
//
// static inline const vec_t load( T* addr ) {
// return *addr;
// }
//
// static inline const vec_t bit_and( const vec_t &a, const vec_t &b ) {
// return a & b;
// }
//
// static inline const vec_t add( const vec_t &a, const vec_t &b ) {
// return a + b;
// }
//
// static inline const vec_t cmp_zero( const vec_t &a ) {
// //return _mm_cmpeq_epi16( a, setzero() );
// return a == 0 ? 0xffff : 0x0;
// }
//
// static inline const vec_t min( const vec_t &a, const vec_t &b ) {
// return std::min( a, b );
// }
// };
#endif // VEC_UNIT_H