Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

investigate loop unrolling #317

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 72 additions & 51 deletions src/westmere/sse_convert_latin1_to_utf8.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,20 @@
std::pair<const char* const, char* const> sse_convert_latin1_to_utf8(
/*
 * EAT_LATIN_(n): transcode the 16 Latin-1 bytes held in `v_latin_<n>` to UTF-8
 * and append the result at `utf8_output`.
 *
 * This macro is not hygienic. The expansion site must have these names in scope:
 *   - v_latin_<n>                      : the 16 input bytes (__m128i)
 *   - v_80                             : mask tested against the input; presumably
 *                                        0x80 in every byte -- confirm at the definition
 *   - v_0000, v_ff80                   : constants forwarded to the write helper
 *   - latin_1_half_into_u16_byte_mask,
 *     latin_2_half_into_u16_byte_mask  : shuffle masks that zero-extend the low /
 *                                        high 8 input bytes to eight uint16 lanes
 *   - utf8_output                      : mutable output cursor (char*)
 *
 * The macro never advances the input pointer; the caller does that once per
 * unrolled iteration.
 *
 * The body is wrapped in do { ... } while (0) so that `EAT_LATIN_(n);` is exactly
 * one statement and stays correct when used as the body of an unbraced if/else.
 */
#define EAT_LATIN_(n) \
  do { \
    if (_mm_testz_si128(v_latin_##n, v_80)) { /* ASCII fast path: no byte has a bit in common with v_80 */ \
      _mm_storeu_si128(reinterpret_cast<__m128i*>(utf8_output), v_latin_##n); \
      utf8_output += 16; \
    } else { \
      /* assuming a/b are bytes and A/B are uint16 of the same value */ \
      /* aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA */ \
      const __m128i v_u16_latin_1_half_##n = _mm_shuffle_epi8(v_latin_##n, latin_1_half_into_u16_byte_mask); \
      /* aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB */ \
      const __m128i v_u16_latin_2_half_##n = _mm_shuffle_epi8(v_latin_##n, latin_2_half_into_u16_byte_mask); \
      \
      /* NOTE(review): the helper is expected to advance utf8_output itself */ \
      /* (the macro does not) -- confirm against its definition. */ \
      internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half_##n, utf8_output, v_0000, v_ff80); \
      internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half_##n, utf8_output, v_0000, v_ff80); \
    } \
  } while (0)

std::pair<const char*, char*> sse_convert_latin1_to_utf8(
const char* latin_input,
const size_t latin_input_length,
char* utf8_output) {
Expand All @@ -11,72 +27,77 @@ std::pair<const char* const, char* const> sse_convert_latin1_to_utf8(
const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);

const __m128i latin_1_half_into_u16_byte_mask = _mm_setr_epi8(
0, '\x80',
1, '\x80',
2, '\x80',
3, '\x80',
4, '\x80',
5, '\x80',
6, '\x80',
7, '\x80'
);
0, '\x80',
1, '\x80',
2, '\x80',
3, '\x80',
4, '\x80',
5, '\x80',
6, '\x80',
7, '\x80'
);

const __m128i latin_2_half_into_u16_byte_mask = _mm_setr_epi8(
8, '\x80',
9, '\x80',
10, '\x80',
11, '\x80',
12, '\x80',
13, '\x80',
14, '\x80',
15, '\x80'
);
8, '\x80',
9, '\x80',
10, '\x80',
11, '\x80',
12, '\x80',
13, '\x80',
14, '\x80',
15, '\x80'
);

// Each Latin-1 byte expands to 1-2 UTF-8 bytes.
// The slow path stores twice; each store eagerly writes 16 bytes, of which only 8-15 are useful, and then adjusts the pointer,
// so the last store can exceed the utf8_output size by 1-8 bytes.
// By holding back 8 extra input bytes, we expect the output to still have 8-16 bytes free.
while (latin_input + 16 + 8 <= end) {

// loop unroll depth 6
while (latin_input + 16 * 6 + 8 <= end) {
// Load 16 Latin1 characters (16 bytes) into a 128-bit register
__m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);
const __m128i v_latin_1 = _mm_loadu_si128((__m128i*)latin_input);
const __m128i v_latin_2 = _mm_loadu_si128((__m128i*)latin_input + 1);
const __m128i v_latin_3 = _mm_loadu_si128((__m128i*)latin_input + 2);
const __m128i v_latin_4 = _mm_loadu_si128((__m128i*)latin_input + 3);
const __m128i v_latin_5 = _mm_loadu_si128((__m128i*)latin_input + 4);
const __m128i v_latin_6 = _mm_loadu_si128((__m128i*)latin_input + 5);

EAT_LATIN_(1);
EAT_LATIN_(2);
EAT_LATIN_(3);
EAT_LATIN_(4);
EAT_LATIN_(5);
EAT_LATIN_(6);

if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!!
_mm_storeu_si128((__m128i*)utf8_output, v_latin);
latin_input += 16;
utf8_output += 16;
continue;
}

latin_input += 16 * 6;
}

// assuming a/b are bytes and A/B are uint16 of the same value
// aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
__m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
// aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
__m128i v_u16_latin_2_half = _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);

// loop unroll depth 3
while (latin_input + 16 * 3 + 8 <= end) {
// Load 16 Latin1 characters (16 bytes) into a 128-bit register
const __m128i v_latin_1 = _mm_loadu_si128((__m128i*)latin_input);
const __m128i v_latin_2 = _mm_loadu_si128((__m128i*)latin_input + 1);
const __m128i v_latin_3 = _mm_loadu_si128((__m128i*)latin_input + 2);

internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half, utf8_output, v_0000, v_ff80);
latin_input += 16;
EAT_LATIN_(1);
EAT_LATIN_(2);
EAT_LATIN_(3);

latin_input += 16 * 3;
}

if (latin_input + 16 <= end) {
while (latin_input + 16 + 8 <= end) {
// Load 16 Latin1 characters (16 bytes) into a 128-bit register
__m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);

if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!!
_mm_storeu_si128((__m128i*)utf8_output, v_latin);
latin_input += 16;
utf8_output += 16;
} else {
// assuming a/b are bytes and A/B are uint16 of the same value
// aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
__m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
latin_input += 8;
}
const __m128i v_latin_1 = _mm_loadu_si128((__m128i*)latin_input);

EAT_LATIN_(1);

latin_input += 16;
}

return std::make_pair(latin_input, utf8_output);
};
};

#undef EAT_LATIN_
Loading