Skip to content

Commit

Permalink
Working on PPC64LE support
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Mar 14, 2020
1 parent 0973417 commit a07a266
Show file tree
Hide file tree
Showing 11 changed files with 199 additions and 50 deletions.
11 changes: 9 additions & 2 deletions .travis.yml
Expand Up @@ -19,6 +19,12 @@ jobs:
- *default_packages
- build-essential
arch: arm64
- addons:
apt:
packages:
- *default_packages
- build-essential
arch: ppc64le
- addons:
apt:
packages:
Expand Down Expand Up @@ -85,6 +91,8 @@ jobs:
- mpi-default-bin
env: MPI=1 CC=gcc-9 CXX=g++-9
fast_finish: true
allow_failures:
- arch: ppc64le

before_install:
- export CC
Expand All @@ -93,8 +101,7 @@ before_install:
script:
- |
mkdir build; cd build; \
[[ "${TRAVIS_CPU_ARCH}" = "arm64" ]] && SIMD="" || SIMD="-DHAVE_SSE4_1=1"; \
cmake -DHAVE_MPI="$([[ -z "$MPI" ]]; echo $?)" "$SIMD" -DENABLE_WERROR=1 -DHAVE_TESTS=1 ..; \
cmake -DHAVE_MPI="$([[ -z "$MPI" ]]; echo $?)" -DENABLE_WERROR=1 -DHAVE_TESTS=1 ..; \
make -j $(nproc --all); \
mkdir path; \
printf '#!/bin/sh\n/usr/bin/tee "$@" | tail\n' > path/tee; \
Expand Down
3 changes: 3 additions & 0 deletions CMakeLists.txt
Expand Up @@ -46,6 +46,9 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
set(ARM 1)
set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -DNEON=1 -DSSE=1 -fsigned-char")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(PPC64.*|ppc64.*|powerpc64.*)")
    # 64-bit POWER: SSE intrinsics are provided through a compatibility header,
    # so the code is still compiled with -DSSE=1.
    # The pattern is a regular expression (not a glob): it is anchored and uses
    # ".*" like the ARM branches above, so it only matches processor names that
    # start with one of the listed prefixes.
    # NOTE(review): this also matches big-endian ppc64; the commit targets
    # ppc64le - confirm whether big-endian should be accepted here.
    set(PPC64 1)
    set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -DSSE=1 -std=gnu++0x -mcpu=power8 -mvsx")
elseif (EMSCRIPTEN)
set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -DSSE=1 -DWASM=1 -msimd128 -s WASM=1 -s ASSERTIONS=1")
else ()
Expand Down
73 changes: 35 additions & 38 deletions Dockerfile
@@ -1,63 +1,60 @@
ARG NAMESPACE=
FROM debian:stable-slim as qemu-downloader
ARG NAMESPACE
RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
RUN if [ X"$NAMESPACE" = X"arm64v8/" ]; then \
wget -nv -O "/usr/bin/qemu-aarch64-static" https://github.com/multiarch/qemu-user-static/releases/download/v3.1.0-2/qemu-aarch64-static; \
else \
echo -e '#!/bin/sh\n"$@"\n' > "/usr/bin/qemu-aarch64-static"; \
RUN if [ X"$NAMESPACE" != X"" ]; then \
apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*; \
fi; \
chmod +x /usr/bin/qemu-aarch64-static;
if [ X"$NAMESPACE" = X"ppc64le/" ]; then \
wget -nv -O /usr/bin/qemu-ppc64le-static https://github.com/multiarch/qemu-user-static/releases/download/v4.2.0-4/qemu-ppc64le-static; \
chmod +x /usr/bin/qemu-ppc64le-static; \
fi; \
if [ X"$NAMESPACE" = X"aarch64/" ]; then \
wget -nv -O /usr/bin/qemu-aarch64-static https://github.com/multiarch/qemu-user-static/releases/download/v4.2.0-4/qemu-aarch64-static; \
chmod +x /usr/bin/qemu-aarch64-static; \
fi; \
touch /usr/bin/dummy_copy

FROM ${NAMESPACE}debian:stable-slim as mmseqs-builder
FROM ${NAMESPACE}debian:stable-slim as builder
ARG NAMESPACE
COPY --from=qemu-downloader /usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static
COPY --from=qemu-downloader /usr/bin/dummy_copy /usr/bin/qemu-aarch64-static* /usr/bin/qemu-ppc64le-static* /usr/bin/

RUN apt-get update && apt-get install -y \
build-essential cmake xxd git zlib1g-dev libbz2-dev \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /opt/mmseqs
ADD . .
RUN mkdir -p build_sse/bin && mkdir -p build_avx/bin && mkdir -p build_neon/bin

WORKDIR /opt/mmseqs/build_sse
RUN if [ X"$NAMESPACE" = X"" ]; then \
cmake -DHAVE_SSE4_1=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
make -j $(nproc --all) && make install; \
fi

WORKDIR /opt/mmseqs/build_avx
RUN if [ X"$NAMESPACE" = X"" ]; then \
cmake -DHAVE_AVX2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
make -j $(nproc --all) && make install; \
fi

WORKDIR /opt/mmseqs/build_neon
RUN if [ X"$NAMESPACE" = X"arm64v8/" ]; then \
cmake -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
make -j $(nproc --all) && make install; \
touch /opt/mmseqs/build_sse/bin/mmseqs; \
touch /opt/mmseqs/build_avx/bin/mmseqs; \
else \
touch /opt/mmseqs/build_neon/bin/mmseqs; \
fi

RUN mkdir -p build_sse/src && mkdir -p build_avx/src && mkdir -p build/src; \
if [ X"$NAMESPACE" = X"" ]; then \
cd /opt/mmseqs/build_sse; \
cmake -DHAVE_SSE4_1=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
make -j $(nproc --all); \
mv src/mmseqs /opt/mmseqs/mmseqs_sse42; \
cd /opt/mmseqs/build_avx; \
cmake -DHAVE_AVX2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
make -j $(nproc --all); \
mv src/mmseqs /opt/mmseqs/mmseqs_avx2; \
else \
cd /opt/mmseqs/build; \
cmake -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
make -j $(nproc --all); \
mv src/mmseqs /opt/mmseqs/mmseqs_arch; \
touch /opt/mmseqs/mmseqs_sse42 /opt/mmseqs/mmseqs_avx2; \
fi

FROM ${NAMESPACE}debian:stable-slim
ARG NAMESPACE
MAINTAINER Milot Mirdita <milot@mirdita.de>
COPY --from=qemu-downloader /usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static
COPY --from=qemu-downloader /usr/bin/dummy_copy /usr/bin/qemu-aarch64-static* /usr/bin/qemu-ppc64le-static* /usr/bin/

RUN apt-get update && apt-get install -y \
gawk bash grep libstdc++6 libgomp1 zlib1g libbz2-1.0 \
gawk bash grep libstdc++6 libgomp1 zlib1g libbz2-1.0 wget tar \
&& rm -rf /var/lib/apt/lists/*

COPY --from=mmseqs-builder /opt/mmseqs/build_sse/bin/mmseqs /usr/local/bin/mmseqs_sse42
COPY --from=mmseqs-builder /opt/mmseqs/build_avx/bin/mmseqs /usr/local/bin/mmseqs_avx2
COPY --from=mmseqs-builder /opt/mmseqs/build_neon/bin/mmseqs /usr/local/bin/mmseqs_neon
COPY --from=builder /opt/mmseqs/mmseqs_arch /opt/mmseqs/mmseqs_sse42 /opt/mmseqs/mmseqs_avx2 /usr/local/bin/
ADD util/mmseqs_wrapper.sh /usr/local/bin/mmseqs

RUN if [ X"$NAMESPACE" = X"arm64v8/" ]; then mv -f /usr/local/bin/mmseqs_neon /usr/local/bin/mmseqs; fi
RUN if [ X"$NAMESPACE" != X"" ]; then mv -f /usr/local/bin/mmseqs_arch /usr/local/bin/mmseqs; fi

CMD ["/usr/local/bin/mmseqs"]

8 changes: 7 additions & 1 deletion lib/ksw2/ksw2_extz2_sse.cpp
Expand Up @@ -43,8 +43,14 @@ See: https://github.com/lh3/minimap2
#define KSW_SSE2_ONLY
#endif

#ifdef __ALTIVEC__
#include "sse2altivec.h"
#define __SSE2__
#define KSW_SSE2_ONLY
#endif

#ifdef __SSE2__
#if !defined(NEON) && !defined(WASM)
#if !defined(NEON) && !defined(WASM) && !defined(__ALTIVEC__)
#include <emmintrin.h>
#endif

Expand Down
10 changes: 7 additions & 3 deletions lib/simd/simd.h
Expand Up @@ -56,9 +56,13 @@
#ifdef WASM
#include "sse2wasm.h"
#else
#ifdef __ALTIVEC__
#include "sse2altivec.h"
#else
#include <xmmintrin.h>
#endif
#endif
#endif

#ifdef AVX512
#include <zmmintrin.h.h> // AVX512
Expand Down Expand Up @@ -291,7 +295,7 @@ typedef __m256 simd_float;
#ifdef SSE
uint16_t simd_hmax16(const __m128i buffer);
uint8_t simd_hmax8(const __m128i buffer);
#if !defined(NEON) && !defined(WASM)
#if !defined(NEON) && !defined(WASM) && !defined(__ALTIVEC__)
#include <smmintrin.h> //SSE4.1
// double support
#ifndef SIMD_DOUBLE
Expand Down Expand Up @@ -401,7 +405,7 @@ typedef __m128i simd_int;
#endif //SIMD_INT
#endif //SSE

#if WASM
#if defined(WASM) || defined(__ALTIVEC__)
template <typename F>
inline F simd_hmax(const F * in, unsigned int n);

Expand Down Expand Up @@ -621,7 +625,7 @@ inline float ScalarProd20(const float* qi, const float* tj) {
//
//
//TODO fix this
#if defined(SSE) && !defined(WASM)
#if defined(SSE) && !defined(WASM) && !defined(__ALTIVEC__)
float __attribute__((aligned(16))) res;
__m128 P; // query 128bit SSE2 register holding 4 floats
__m128 R;// result
Expand Down
127 changes: 127 additions & 0 deletions lib/simd/sse2altivec.h
@@ -0,0 +1,127 @@
// sse2altivec is still very incomplete
// licensed under GPLv3 see LICENCE file
#ifndef SSE2ALTIVEC
#define SSE2ALTIVEC

// ignore all warnings
#pragma GCC system_header

#include <altivec.h>
#include <stdint.h>
#define SSE 1

// Vector types named after the x86 SSE types they stand in for, so that code
// written against the SSE intrinsics can be compiled with AltiVec/VSX vectors.
typedef __vector double __m128d;
typedef __vector float __m128;
// __m128i is a vector of int; the macros below cast it to one of the
// lane-specific types whenever an operation works on another element width.
typedef __vector int __m128i;

// Lane-specific views used by the integer macros (s = signed, u = unsigned,
// number = element width in bits).
typedef __vector signed char simd_s8;
typedef __vector unsigned char simd_u8;
typedef __vector signed short simd_s16;
typedef __vector unsigned short simd_u16;
typedef __vector int64_t simd_s64;
typedef __vector uint64_t simd_u64;

// ---- single-precision float operations -------------------------------------
// Each macro maps one SSE intrinsic onto the matching AltiVec/VSX operation.
#define _mm_add_ps(x,y) (__m128)((__m128)(x) + (__m128)(y))
#define _mm_sub_ps(x,y) (__m128)((__m128)(x) - (__m128)(y))
#define _mm_mul_ps(x,y) (__m128)((__m128)(x) * (__m128)(y))
#define _mm_div_ps(x,y) (__m128)((__m128)(x) / (__m128)(y))
#define _mm_rcp_ps(x) (__m128)vec_re((__m128)(x))
#define _mm_max_ps(x,y) (__m128)vec_max((__m128)(x),(__m128)(y))
#define _mm_min_ps(x,y) (__m128)vec_min((__m128)(x),(__m128)(y))
#define _mm_load_ps(x) (__m128)vec_vsx_ld(0, (__m128 const*)(x))
#define _mm_store_ps(x,y) vec_vsx_st((__m128)(y),0,(__m128*)(x))
// _mm_store_ss writes ONLY the lowest float. A full 16-byte vector store here
// would overrun a destination that is a single float.
#define _mm_store_ss(x,y) ((void)(*(float*)(x) = vec_extract((__m128)(y),0)))
#define _mm_set1_ps(x) (__m128)vec_splats((float)(x))
#define _mm_setzero_ps(x) (__m128)vec_splats((float)0)
#define _mm_cmpgt_ps(x,y) (__m128)vec_cmpgt((__m128)(x),(__m128)(y))
#define _mm_cmpeq_ps(x,y) (__m128)vec_cmpeq((__m128)(x),(__m128)(y))
#define _mm_cmplt_ps(x,y) (__m128)vec_cmplt((__m128)(x),(__m128)(y))
#define _mm_or_ps(x,y) (__m128)vec_or((__m128)(x),(__m128)(y))
#define _mm_and_ps(x,y) (__m128)vec_and((__m128)(x),(__m128)(y))
// SSE andnot computes (~x) & y, whereas vec_andc(a,b) computes a & (~b),
// so the operands have to be passed in swapped order.
#define _mm_andnot_ps(x,y) (__m128)vec_andc((__m128)(y),(__m128)(x))
#define _mm_xor_ps(x,y) (__m128)vec_xor((__m128)(x),(__m128)(y))
// _mm_cvtps_epi32 rounds to the nearest integer (default SSE rounding mode),
// while vec_cts alone truncates toward zero; round first, then convert.
#define _mm_cvtps_epi32(x) (__m128i)vec_cts(vec_rint((__m128)(x)),0)
#define _mm_castps_si128(x) (__m128i)(x)
// ---- integer arithmetic, min/max, load/store, compare and logic ------------
// Operands are cast to the lane type the SSE intrinsic operates on
// (simd_s8/simd_u8/simd_s16/simd_u16 or __m128i for 32-bit lanes) and the
// result is cast back to __m128i.
#define _mm_add_epi32(x,y) (__m128i)vec_add((__m128i)(x),(__m128i)(y))
#define _mm_add_epi16(x,y) (__m128i)vec_add((simd_s16)(x),(simd_s16)(y))
#define _mm_add_epi8(x,y) (__m128i)vec_add((simd_s8)(x),(simd_s8)(y))
#define _mm_adds_epi16(x,y) (__m128i)vec_adds((simd_s16)(x),(simd_s16)(y))
#define _mm_adds_epu8(x,y) (__m128i)vec_adds((simd_u8)(x),(simd_u8)(y))
#define _mm_sub_epi32(x,y) (__m128i)vec_sub((__m128i)(x),(__m128i)(y))
#define _mm_sub_epi16(x,y) (__m128i)vec_sub((simd_s16)(x),(simd_s16)(y))
#define _mm_sub_epi8(x,y) (__m128i)vec_sub((simd_s8)(x),(simd_s8)(y))
#define _mm_subs_epu16(x,y) (__m128i)vec_subs((simd_u16)(x),(simd_u16)(y))
#define _mm_subs_epu8(x,y) (__m128i)vec_subs((simd_u8)(x),(simd_u8)(y))
#define _mm_mullo_epi32(x,y) (__m128i)vec_mul((__m128i)(x),(__m128i)(y))
#define _mm_max_epi32(x,y) (__m128i)vec_max((__m128i)(x),(__m128i)(y))
#define _mm_max_epi16(x,y) (__m128i)vec_max((simd_s16)(x),(simd_s16)(y))
#define _mm_max_epu8(x,y) (__m128i)vec_max((simd_u8)(x),(simd_u8)(y))
#define _mm_min_epu8(x,y) (__m128i)vec_min((simd_u8)(x),(simd_u8)(y))
// Aligned and unaligned variants map to the same VSX load/store.
#define _mm_load_si128(x) (__m128i)vec_vsx_ld(0,(__m128i const*)(x))
#define _mm_loadu_si128(x) (__m128i)vec_vsx_ld(0,(__m128i const*)(x))
#define _mm_storeu_si128(x,y) vec_vsx_st((__m128i)(y),0,(__m128i*)(x))
#define _mm_store_si128(x,y) vec_vsx_st((__m128i)(y),0,(__m128i*)(x))
#define _mm_set1_epi32(x) (__m128i)vec_splats((signed int)(x))
#define _mm_set1_epi16(x) (__m128i)vec_splats((signed short)(x))
#define _mm_set1_epi8(x) (__m128i)vec_splats((signed char)(x))
#define _mm_setzero_si128(x) (__m128i)vec_splats(0)
#define _mm_cmpgt_epi32(x,y) (__m128i)vec_cmpgt((__m128i)(x),(__m128i)(y))
#define _mm_cmpgt_epi16(x,y) (__m128i)vec_cmpgt((simd_s16)(x),(simd_s16)(y))
#define _mm_cmpgt_epi8(x,y) (__m128i)vec_cmpgt((simd_s8)(x),(simd_s8)(y))
#define _mm_cmpeq_epi32(x,y) (__m128i)vec_cmpeq((__m128i)(x),(__m128i)(y))
#define _mm_cmpeq_epi16(x,y) (__m128i)vec_cmpeq((simd_s16)(x),(simd_s16)(y))
#define _mm_cmpeq_epi8(x,y) (__m128i)vec_cmpeq((simd_s8)(x),(simd_s8)(y))
#define _mm_cmplt_epi32(x,y) (__m128i)vec_cmplt((__m128i)(x),(__m128i)(y))
#define _mm_cmplt_epi16(x,y) (__m128i)vec_cmplt((simd_s16)(x),(simd_s16)(y))
#define _mm_cmplt_epi8(x,y) (__m128i)vec_cmplt((simd_s8)(x),(simd_s8)(y))
#define _mm_or_si128(x,y) (__m128i)vec_or((__m128i)(x),(__m128i)(y))
#define _mm_and_si128(x,y) (__m128i)vec_and((__m128i)(x),(__m128i)(y))
// SSE andnot computes (~x) & y, whereas vec_andc(a,b) computes a & (~b),
// so the operands have to be passed in swapped order.
#define _mm_andnot_si128(x,y) (__m128i)vec_andc((__m128i)(y),(__m128i)(x))
#define _mm_xor_si128(x,y) (__m128i)vec_xor((__m128i)(x),(__m128i)(y))
// ---- lane extraction, shifts, conversions, packing and set -----------------
// The SSE extract intrinsics return the selected lane ZERO-extended to int;
// going through the unsigned type avoids sign-extending lanes >= 0x8000/0x80.
#define _mm_extract_epi16(x,imm) ((int)(unsigned short)vec_extract((simd_s16)(x),(imm)))
#define _mm_extract_epi8(x,imm) ((int)(unsigned char)vec_extract((simd_s8)(x),(imm)))
// Per-lane shifts by a scalar count; results are cast back to __m128i like
// every other integer macro in this header.
#define _mm_slli_epi16(x,y) (__m128i)vec_sl((simd_s16)(x),vec_splats((unsigned short)(y)))
#define _mm_srli_epi16(x,y) (__m128i)vec_sr((simd_s16)(x),vec_splats((unsigned short)(y)))
#define _mm_slli_epi32(x,y) (__m128i)vec_sl((__m128i)(x),vec_splats((unsigned int)(y)))
#define _mm_srli_epi32(x,y) (__m128i)vec_sr((__m128i)(x),vec_splats((unsigned int)(y)))
#define _mm_cvtepi32_ps(x) (__m128)vec_ctf((__m128i)(x),0)
#define _mm_castsi128_ps(x) (__m128)(x)
// Whole-register shifts by y bytes; the byte count is converted to a bit count
// ((y) << 3). y is parenthesized so that expression arguments expand correctly.
// NOTE(review): byte counts >= 16 are not special-cased here - confirm callers
// only pass 0..15.
#define _mm_slli_si128(x,y) (__m128i)vec_slo((simd_u8)(x),(simd_u8)vec_splats((unsigned char)((y) << 3)))
#define _mm_srli_si128(x,y) (__m128i)vec_sro((simd_u8)(x),(simd_u8)vec_splats((unsigned char)((y) << 3)))
#define _mm_cvtsi128_si64(a) (int64_t)vec_extract((simd_s64)(a),0)
#define _mm_cvtsi128_si32(a) (int32_t)vec_extract((__m128i)(a),0)
#define _mm_cvtsi64_si128(a) (__m128i)((simd_s64){(int64_t)(a),0})
#define _mm_cvtsi32_si128(a) (__m128i){(int)(a),0,0,0}
#define _mm_packs_epi32(x,y) (__m128i)vec_packs((__m128i)(x), (__m128i)(y))
#define _mm_packus_epi16(x,y) (__m128i)vec_packsu((simd_s16)(x), (simd_s16)(y))
// _mm_set_*  : arguments are given highest lane first, so they are stored in
//              reverse order (last argument ends up in element 0).
// _mm_setr_* : arguments are given in element order (first argument ends up
//              in element 0).
#define _mm_set_epi32(e3,e2,e1,e0) (__m128i){(e0),(e1),(e2),(e3)}
#define _mm_setr_epi32(e0,e1,e2,e3) (__m128i){(e0),(e1),(e2),(e3)}
#define _mm_set_epi16(e7,e6,e5,e4,e3,e2,e1,e0) \
    (__m128i)((simd_s16){(e0),(e1),(e2),(e3),(e4),(e5),(e6),(e7)})
#define _mm_setr_epi16(e0,e1,e2,e3,e4,e5,e6,e7) \
    (__m128i)((simd_s16){(e0),(e1),(e2),(e3),(e4),(e5),(e6),(e7)})
#define _mm_set_epi8(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) \
    (__m128i)((simd_s8){(e0),(e1),(e2),(e3),(e4),(e5),(e6),(e7),(e8),(e9),(e10),(e11),(e12),(e13),(e14),(e15)})
#define _mm_setr_epi8(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
    (__m128i)((simd_s8){(e0),(e1),(e2),(e3),(e4),(e5),(e6),(e7),(e8),(e9),(e10),(e11),(e12),(e13),(e14),(e15)})

// Stand-in for _mm_storel_epi64: writes 64-bit element 0 of `a` to the memory
// that `mem_addr` points to. Only 8 bytes are written.
static inline void _mm_storel_epi64(__m128i* mem_addr, __m128i a) {
    const simd_s64 lanes = (simd_s64)(a);
    int64_t* const dst = (int64_t*)mem_addr;
    *dst = (int64_t)vec_extract(lanes, 0);
}

// From OpenCV
// https://github.com/opencv/opencv/pull/15235
// 3-Clause BSD License
// Stand-in for _mm_movemask_epi8: collects one bit per byte of `value` into a
// 16-bit mask.
static inline unsigned short _mm_movemask_epi8(__m128i value) {
    // Bit indices handed to vec_vbpermq: one per byte, stepping by 8 from 120
    // down to 0.
    static const simd_u8 perm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
    const __m128i gathered = (__m128i)vec_vbpermq((simd_u8)value, perm);
    // The mask is read from 32-bit element 2 of the result.
    // NOTE(review): which element holds the gathered bits presumably depends on
    // the element ordering (little-endian here) - confirm before enabling
    // big-endian builds.
    return vec_extract(gathered, 2);
}

// From reedsolomon
// https://github.com/NicolasT/reedsolomon/blob/master/cbits/reedsolomon.c
// MIT License
// Stand-in for _mm_shuffle_epi8 (pshufb): for every byte lane i,
//   result[i] = (b[i] & 0x80) ? 0 : a[b[i] & 0x0F]
//
// a: source bytes
// b: per-lane selector bytes
static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
    const simd_s8 zero = vec_splats((signed char)0);
    // vec_perm indexes into the 32 bytes of its two sources using the low
    // 5 bits of each selector. Passing `a` twice makes bit 4 irrelevant, so
    // effectively only the low 4 bits select a byte, as pshufb does.
    const simd_s8 picked = vec_perm((simd_s8)a, (simd_s8)a, (simd_u8)b);
    // Selectors with bit 7 set are negative as signed bytes; those lanes must
    // be forced to zero.
    return (__m128i)vec_sel(picked, zero, vec_cmplt((simd_s8)b, zero));
}

#endif
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Expand Up @@ -121,7 +121,7 @@ if (HAVE_POSIX_MADVISE)
endif ()

# SIMD instruction sets support
if (ARM OR EMSCRIPTEN)
if (ARM OR PPC64 OR EMSCRIPTEN)
elseif (HAVE_AVX2)
target_compile_definitions(mmseqs-framework PUBLIC -DAVX2=1)
if (CMAKE_COMPILER_IS_CLANG)
Expand Down
2 changes: 1 addition & 1 deletion src/commons/Application.cpp
Expand Up @@ -4,7 +4,7 @@
#include "DistanceCalculator.h"
#include "Timer.h"

#if !defined(NEON) && !defined(WASM)
#if !defined(NEON) && !defined(WASM) && !defined(__ALTIVEC__)
#include <CpuInfo.h>
#else
#define NO_CPUINFO
Expand Down
2 changes: 1 addition & 1 deletion src/commons/Orf.cpp
Expand Up @@ -211,7 +211,7 @@ inline bool isInCodons(const char* sequence, simd_int codons, simd_int) {
// c: ATG0 ATG0 ATG0 ATG0
c = simdi_and(mask, c);
// t: FFFF 0000 0000 0000
simd_int test = simdi32_eq(shuf, codons);
simd_int test = simdi32_eq(c, codons);
#ifndef AVX2
if (N > 4) {
simd_int test2 = simdi32_eq(c, codons2);
Expand Down
2 changes: 1 addition & 1 deletion src/commons/Util.cpp
Expand Up @@ -604,7 +604,7 @@ uint64_t Util::revComplement(const uint64_t kmer, const int k) {
// create lookup (set 16 bytes in 128 bit)
// a lookup entry at the index of two nucleotides (4 bit) describes the reverse
// complement of these two nucleotide in the higher 4 bits (lookup1) or in the lower 4 bits (lookup2)
#define c (char)
#define c (signed char)
__m128i lookup1 = _mm_set_epi8(c(0x50),c(0x10),c(0xD0),c(0x90),c(0x40),c(0x00),c(0xC0),c(0x80),
c(0x70),c(0x30),c(0xF0),c(0xB0),c(0x60),c(0x20),c(0xE0),c(0xA0));
__m128i lookup2 = _mm_set_epi8(c(0x05),c(0x01),c(0x0D),c(0x09),c(0x04),c(0x00),c(0x0C),c(0x08),
Expand Down
9 changes: 7 additions & 2 deletions src/commons/itoa.h
Expand Up @@ -22,9 +22,10 @@ THE SOFTWARE.
*/
// SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html
// Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer.
#include <stdint.h>
#include <cstdint>

#ifdef WASM
#if defined(WASM) || defined(__ALTIVEC__)
#include <cstdio>
class Itoa{
public:
static char* u32toa_sse2(uint32_t value, char* buffer) {
Expand All @@ -44,8 +45,12 @@ class Itoa{
#ifdef NEON
#include "sse2neon.h"
#else
#ifdef __ALTIVEC__
#include "sse2altivec.h"
#else
#include <emmintrin.h>
#endif
#endif


#define ALIGN_PRE
Expand Down

0 comments on commit a07a266

Please sign in to comment.