Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sorting changes stage 1 #6668

Merged
merged 1 commit into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

### Breaking changes
* Upgrading from Realm files produced by RealmCore v5.23.9 or earlier is no longer supported.
* Remove `set_string_compare_method`. Only one sort method is now supported, which was previously called `STRING_COMPARE_CORE`.

### Compatibility
* Fileformat: Generates files with format v24. Reads and automatically upgrades from fileformat v10. If you want to upgrade from an earlier file format version you will have to use RealmCore v13.x.y or earlier.
Expand Down
19 changes: 0 additions & 19 deletions src/realm/query_conditions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -987,25 +987,6 @@ struct GreaterEqual : public HackClass {
static const int condition = -1;
};


// CompareLess: stop-gap generic "less than" comparator that can be used with any realm type. TODO: enable a
// proper operator< for StringData (currently blocked by a circular header dependency with utf8.hpp).
template <class T>
struct CompareLess {
    // Returns true when v1 orders before v2 according to T's operator<.
    // The two trailing bool parameters are accepted but ignored.
    static bool compare(T v1, T v2, bool = false, bool = false)
    {
        const bool is_less = v1 < v2;
        return is_less;
    }
};
// Specialization for StringData: orders two strings with utf8_compare().
//
// The StringData values are forwarded unchanged so that their explicit sizes are honored. Passing
// v1.data() / v2.data() instead would rebuild each StringData from a bare `const char*`, i.e. measure
// it as a C string: that truncates at an embedded NUL and reads past the end of a buffer that is not
// NUL-terminated (NOTE(review): StringData presumably gives no NUL-termination guarantee - confirm).
// The two trailing bool parameters are accepted but ignored.
template <>
struct CompareLess<StringData> {
    static bool compare(StringData v1, StringData v2, bool = false, bool = false)
    {
        return utf8_compare(v1, v2);
    }
};

} // namespace realm

#endif // REALM_QUERY_CONDITIONS_HPP
184 changes: 41 additions & 143 deletions src/realm/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
*
**************************************************************************/

#include <realm/unicode.hpp>

#include <algorithm>
#include <clocale>
#include <vector>

#ifdef _WIN32
Expand All @@ -28,74 +31,11 @@
#include <ctype.h>
#endif

#include <realm/util/safe_int_ops.hpp>
#include <realm/unicode.hpp>

#include <clocale>

#ifdef _MSC_VER
#include <codecvt>
#else
#include <locale>
#endif

using namespace realm;

namespace {

// Converts the UTF-8 bytes in `str` to a wide string.
//
// Only implemented for MSVC, where wchar_t is 16 bits and the result is UTF-16. On every other
// toolchain the function asserts and returns an empty string.
//
// The conversion is driven by str.size() rather than by a terminating NUL, so the input does not have
// to be NUL-terminated and embedded NUL characters are converted like any other character. Empty or
// unconvertible input yields an empty string (silent error treatment).
std::wstring utf8_to_wstring(StringData str)
{
#if defined(_MSC_VER)
    // __STDC_UTF_16__ seems not to work
    static_assert(sizeof(wchar_t) == 2, "Expected Windows to use utf16");

    // MultiByteToWideChar fails when asked to convert zero bytes, so handle empty input up front.
    if (str.size() == 0)
        return std::wstring();

    // NOTE(review): the Win32 API takes an int length; inputs larger than INT_MAX bytes are not handled.
    const int utf8_len = static_cast<int>(str.size());

    // First pass: ask how many wide characters the output needs (no terminator is counted because an
    // explicit input length is given).
    const int wchars_num = MultiByteToWideChar(CP_UTF8, 0, str.data(), utf8_len, nullptr, 0);
    if (wchars_num <= 0)
        return std::wstring();

    // Second pass: convert straight into the result string's storage.
    std::wstring w_result(static_cast<size_t>(wchars_num), L'\0');
    MultiByteToWideChar(CP_UTF8, 0, str.data(), utf8_len, &w_result[0], wchars_num);

    return w_result;
#else
    // gcc 4.7 and 4.8 do not yet support codecvt_utf8_utf16 and wstring_convert, and note that we can NOT just use
    // setlocale() + mbstowcs() because setlocale is extremely slow and may change locale of the entire user process
    static_cast<void>(str);
    REALM_ASSERT(false);
    return L"";
#endif
}

} // unnamed namespace


namespace realm {

// Highest Unicode code point covered by the built-in collation tables used for *sorting* strings in Realm
// (the end of 'Latin Extended 2', i.e. code points 0...591). The tables hold
// last_latin_extended_2_unicode + 1 entries; when either character being compared lies above this value
// the characters are ordered by their raw code point instead of by table lookup.
constexpr size_t last_latin_extended_2_unicode = 591;

// Selects the process-wide string comparison method.
//
// - STRING_COMPARE_CPP11 is refused (returns false, previous setting untouched) on Android builds and
//   whenever the environment's default locale is named "C".
// - STRING_COMPARE_CALLBACK stores `callback` in string_compare_callback.
// - Any other value needs no extra preparation.
//
// On success the chosen method is stored in string_compare_method and true is returned.
bool set_string_compare_method(string_compare_method_t method, StringCompareCallback callback)
{
    switch (method) {
        case STRING_COMPARE_CPP11: {
#if REALM_ANDROID
            // If Realm wasn't compiled as C++11, just return false.
            return false;
#else
            const std::string locale_name = std::locale("").name();
            // We cannot use C locale because it puts 'Z' before 'a'
            if (locale_name == "C")
                return false;
            break;
#endif
        }
        case STRING_COMPARE_CALLBACK:
            string_compare_callback = std::move(callback);
            break;
        default:
            break;
    }

    // other success actions
    string_compare_method = method;
    return true;
}

// clang-format off
// Returns the number of bytes in a UTF-8 sequence whose leading byte is as specified.
size_t sequence_length(char lead)
Expand All @@ -115,7 +55,7 @@ size_t sequence_length(char lead)
// Check if the next UTF-8 sequence in [begin, end) is identical to
// the one beginning at begin2. If it is, 'begin' is advanced
// accordingly.
inline bool equal_sequence(const char*& begin, const char* end, const char* begin2)
bool equal_sequence(const char*& begin, const char* end, const char* begin2)
{
if (begin[0] != begin2[0])
return false;
Expand Down Expand Up @@ -177,24 +117,6 @@ bool utf8_compare(StringData string1, StringData string2)
// come last. NOTE: This is a limitation of STRING_COMPARE_CORE until we get better such 'locale' support.

// clang-format off
static const uint32_t collation_order_core_similar[last_latin_extended_2_unicode + 1] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 456, 457, 458, 459, 460, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 461, 462, 463, 464, 8130, 465, 466, 467,
468, 469, 470, 471, 472, 473, 474, 475, 8178, 8248, 8433, 8569, 8690, 8805, 8912, 9002, 9093, 9182, 476, 477, 478, 479, 480, 481, 482, 9290, 9446, 9511, 9595, 9690, 9818, 9882, 9965, 10051, 10156, 10211, 10342, 10408, 10492, 10588,
10752, 10828, 10876, 10982, 11080, 11164, 11304, 11374, 11436, 11493, 11561, 483, 484, 485, 486, 487, 488, 9272, 9428, 9492, 9575, 9671, 9800, 9864, 9947, 10030, 10138, 10193, 10339, 10389, 10474, 10570, 10734, 10811, 10857, 10964, 11062, 11146, 11285, 11356,
11417, 11476, 11543, 489, 490, 491, 492, 27, 28, 29, 30, 31, 32, 493, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
494, 495, 8128, 8133, 8127, 8135, 496, 497, 498, 499, 9308, 500, 501, 59, 502, 503, 504, 505, 8533, 8669, 506, 12018, 507, 508, 509, 8351, 10606, 510, 8392, 8377, 8679, 511, 9317, 9315, 9329, 9353, 9348, 9341, 9383, 9545,
9716, 9714, 9720, 9732, 10078, 10076, 10082, 10086, 9635, 10522, 10615, 10613, 10619, 10640, 10633, 512, 10652, 11190, 11188, 11194, 11202, 11515, 11624, 11038, 9316, 9314, 9328, 9352, 9345, 9340, 9381, 9543, 9715, 9713, 9719, 9731, 10077, 10075, 10081, 10085,
9633, 10521, 10614, 10612, 10618, 10639, 10630, 513, 10651, 11189, 11187, 11193, 11199, 11514, 11623, 11521, 9361, 9360, 9319, 9318, 9359, 9358, 9536, 9535, 9538, 9537, 9542, 9541, 9540, 9539, 9620, 9619, 9626, 9625, 9744, 9743, 9718, 9717, 9736, 9735,
9742, 9741, 9730, 9729, 9909, 9908, 9907, 9906, 9913, 9912, 9915, 9914, 9989, 9988, 10000, 9998, 10090, 10089, 10095, 10094, 10080, 10079, 10093, 10092, 10091, 10120, 10113, 10112, 10180, 10179, 10240, 10239, 10856, 10322, 10321, 10326, 10325, 10324, 10323, 10340,
10337, 10328, 10327, 10516, 10515, 10526, 10525, 10520, 10519, 11663, 10567, 10566, 10660, 10659, 10617, 10616, 10638, 10637, 10689, 10688, 10901, 10900, 10907, 10906, 10903, 10902, 11006, 11005, 11010, 11009, 11018, 11017, 11012, 11011, 11109, 11108, 11104, 11103, 11132, 11131,
11215, 11214, 11221, 11220, 11192, 11191, 11198, 11197, 11213, 11212, 11219, 11218, 11401, 11400, 11519, 11518, 11522, 11583, 11582, 11589, 11588, 11587, 11586, 11027, 9477, 9486, 9488, 9487, 11657, 11656, 10708, 9568, 9567, 9662, 9664, 9667, 9666, 11594, 9774, 9779,
9784, 9860, 9859, 9937, 9943, 10014, 10135, 10129, 10266, 10265, 10363, 10387, 11275, 10554, 10556, 10723, 10673, 10672, 9946, 9945, 10802, 10801, 10929, 11653, 11652, 11054, 11058, 11136, 11139, 11138, 11141, 11232, 11231, 11282, 11347, 11537, 11536, 11597, 11596, 11613,
11619, 11618, 11621, 11645, 11655, 11654, 11125, 11629, 11683, 11684, 11685, 11686, 9654, 9653, 9652, 10345, 10344, 10343, 10541, 10540, 10539, 9339, 9338, 10084, 10083, 10629, 10628, 11196, 11195, 11211, 11210, 11205, 11204, 11209, 11208, 11207, 11206, 9773, 9351, 9350,
9357, 9356, 9388, 9387, 9934, 9933, 9911, 9910, 10238, 10237, 10656, 10655, 10658, 10657, 11616, 11615, 10181, 9651, 9650, 9648, 9905, 9904, 10015, 11630, 10518, 10517, 9344, 9343, 9386, 9385, 10654, 10653, 9365, 9364, 9367, 9366, 9752, 9751, 9754, 9753,
10099, 10098, 10101, 10100, 10669, 10668, 10671, 10670, 10911, 10910, 10913, 10912, 11228, 11227, 11230, 11229, 11026, 11025, 11113, 11112, 11542, 11541, 9991, 9990, 10557, 9668, 10731, 10730, 11601, 11600, 9355, 9354, 9738, 9737, 10636, 10635, 10646, 10645, 10648, 10647,
10650, 10649, 11528, 11527, 10382, 10563, 11142, 10182, 9641, 10848, 9409, 9563, 9562, 10364, 11134, 11048, 11606, 11660, 11659, 9478, 11262, 11354, 9769, 9768, 10186, 10185, 10855, 10854, 10936, 10935, 11535, 11534
};

static const uint32_t collation_order_core[last_latin_extended_2_unicode + 1] = {
0, 2, 3, 4, 5, 6, 7, 8, 9, 33, 34, 35, 36, 37, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 38, 39, 40, 41, 42, 43, 29, 44, 45, 46, 76, 47, 30, 48, 49, 128, 132, 134, 137, 139, 140, 143, 144, 145, 146, 50, 51, 77, 78, 79, 52, 53, 148, 182, 191, 208, 229, 263, 267, 285, 295, 325, 333, 341, 360, 363, 385, 429, 433, 439, 454, 473, 491, 527, 531, 537, 539, 557, 54, 55, 56, 57, 58, 59, 147, 181, 190, 207,
228, 262, 266, 284, 294, 324, 332, 340, 359, 362, 384, 428, 432, 438, 453, 472, 490, 526, 530, 536, 538, 556, 60, 61, 62, 63, 28, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 32, 64, 72, 73, 74, 75, 65, 88, 66, 89, 149, 81, 90, 1, 91, 67, 92, 80, 136, 138, 68, 93, 94, 95, 69, 133, 386, 82, 129, 130, 131, 70, 153, 151, 157, 165, 575, 588, 570, 201, 233,
Expand All @@ -205,73 +127,49 @@ bool utf8_compare(StringData string1, StringData string2)
};
// clang-format on

bool use_internal_sort_order =
(string_compare_method == STRING_COMPARE_CORE) || (string_compare_method == STRING_COMPARE_CORE_SIMILAR);

if (use_internal_sort_order) {
// Core-only method. Compares in us_EN locale (sorting may be slightly inaccurate in some countries). Will
// return arbitrary return value for invalid utf8 (silent error treatment). If one or both strings have
// unicodes beyond 'Latin Extended 2' (0...591), then the strings are compared by unicode value.
uint32_t char1;
uint32_t char2;
do {
size_t remaining1 = string1.size() - (s1 - string1.data());
size_t remaining2 = string2.size() - (s2 - string2.data());

if ((remaining1 == 0) != (remaining2 == 0)) {
// exactly one of the strings have ended (not both or none; xor)
return (remaining1 == 0);
}
else if (remaining2 == 0 && remaining1 == 0) {
// strings are identical
return false;
}

// invalid utf8
if (remaining1 < sequence_length(s1[0]) || remaining2 < sequence_length(s2[0]))
return false;
// Core-only method. Compares in us_EN locale (sorting may be slightly inaccurate in some countries). Will
// return arbitrary return value for invalid utf8 (silent error treatment). If one or both strings have
// unicodes beyond 'Latin Extended 2' (0...591), then the strings are compared by unicode value.
uint32_t char1;
uint32_t char2;
do {
size_t remaining1 = string1.size() - (s1 - string1.data());
size_t remaining2 = string2.size() - (s2 - string2.data());

if ((remaining1 == 0) != (remaining2 == 0)) {
// exactly one of the strings have ended (not both or none; xor)
return (remaining1 == 0);
}
else if (remaining2 == 0 && remaining1 == 0) {
// strings are identical
return false;
}

char1 = utf8value(s1);
char2 = utf8value(s2);
// invalid utf8
if (remaining1 < sequence_length(s1[0]) || remaining2 < sequence_length(s2[0]))
return false;

if (char1 == char2) {
// Go to next characters for both strings
s1 += sequence_length(s1[0]);
s2 += sequence_length(s2[0]);
}
else {
// Test if above Latin Extended B
if (char1 > last_latin_extended_2_unicode || char2 > last_latin_extended_2_unicode)
return char1 < char2;
char1 = utf8value(s1);
char2 = utf8value(s2);

const uint32_t* internal_collation_order = collation_order_core;
if (string_compare_method == STRING_COMPARE_CORE_SIMILAR) {
internal_collation_order = collation_order_core_similar;
}
uint32_t value1 = internal_collation_order[char1];
uint32_t value2 = internal_collation_order[char2];
if (char1 == char2) {
// Go to next characters for both strings
s1 += sequence_length(s1[0]);
s2 += sequence_length(s2[0]);
}
else {
// Test if above Latin Extended B
if (char1 > last_latin_extended_2_unicode || char2 > last_latin_extended_2_unicode)
return char1 < char2;

return value1 < value2;
}
const uint32_t* internal_collation_order = collation_order_core;
uint32_t value1 = internal_collation_order[char1];
uint32_t value2 = internal_collation_order[char2];

} while (true);
}
else if (string_compare_method == STRING_COMPARE_CPP11) {
// C++11. Precise sorting in user's current locale. Arbitrary return value (silent error) for invalid utf8
std::wstring wstring1 = utf8_to_wstring(string1);
std::wstring wstring2 = utf8_to_wstring(string2);
std::locale l = std::locale("");
bool ret = l(wstring1, wstring2);
return ret;
}
else if (string_compare_method == STRING_COMPARE_CALLBACK) {
// Callback method
bool ret = string_compare_callback(s1, s2);
return ret;
}
return value1 < value2;
}

REALM_ASSERT(false);
return false;
} while (true);
}

// Converts UTF-8 source into upper or lower case. This function
Expand Down
48 changes: 0 additions & 48 deletions src/realm/unicode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#ifndef REALM_UNICODE_HPP
#define REALM_UNICODE_HPP

#include <locale>
#include <cstdint>
#include <string>

Expand All @@ -30,53 +29,6 @@

namespace realm {

// Identifies the algorithm used to order strings. The numeric values are spelled out because callers
// may pass the method by number (0, 1, 2, ...).
enum string_compare_method_t {
    // Fast core-only compare using the built-in collation table (no OS/framework calls).
    STRING_COMPARE_CORE = 0,
    // Native C++11 compare in the user's current locale.
    STRING_COMPARE_CPP11 = 1,
    // Compare through a user-supplied callback.
    STRING_COMPARE_CALLBACK = 2,
    // Core-only compare using the alternative "similar" collation table.
    STRING_COMPARE_CORE_SIMILAR = 3
};

extern StringCompareCallback string_compare_callback;
extern string_compare_method_t string_compare_method;

// Description for set_string_compare_method():
//
// Short summary: iOS language binding: call
// set_string_compare_method() for fast but slightly inaccurate sort in some countries, or
// set_string_compare_method(2, callbackptr) for slow but precise sort (see callbackptr below)
//
// Different countries ('locales') have different sorting orders for strings and letters. Because there unfortunately
// doesn't exist any unified, standardized way to compare strings in C++ on multiple platforms, we need this method.
//
// It determines how sorting a TableView by a String column must take place. The 'method' argument can be:
//
// 0: Fast core-only compare (no OS/framework calls). LIMITATIONS: Works only up to 'Latin Extended 2' (unicodes
// 0...591). Also, sorting order is according to 'en_US' so it may be slightly inaccurate for some countries.
// 'callback' argument is ignored.
//
// Return value: Always 'true'
//
// 1: Native C++11 method if core is compiled as C++11. Gives precise sorting according
// to user's current locale. LIMITATIONS: Currently works only on Windows and on Linux with clang. Does NOT work on
// iOS (due to only 'C' locale being available in CoreFoundation, which puts 'Z' before 'a'). Unknown if works on
// Windows Phone / Android. Furthermore it does NOT work on Linux with gcc 4.7 or 4.8 (lack of c++11 feature that
// can convert utf8->wstring without calls to setlocale()).
//
// Return value: 'true' if supported, otherwise 'false' (if so, then previous setting, if any, is preserved).
//
// 2: Callback method. Language binding / C++ user must provide a utf-8 callback method of prototype:
// bool callback(const char* string1, const char* string2) where 'callback' must return bool(string1 < string2).
//
// Return value: Always 'true'
//
// Default is method = 0 if the function is never called
//
// NOT THREAD SAFE! Call once during initialization or make sure it's not called simultaneously with different
// arguments. The setting is remembered per-process; it does NOT need to be called prior to each sort
bool set_string_compare_method(string_compare_method_t method, StringCompareCallback callback);


// Return size in bytes of utf8 character. No error checking
size_t sequence_length(char lead);

Expand Down
3 changes: 0 additions & 3 deletions src/realm/utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,6 @@ namespace realm {
signed char sse_support = -1;
signed char avx_support = -1;

StringCompareCallback string_compare_callback = nullptr;
string_compare_method_t string_compare_method = STRING_COMPARE_CORE;

void cpuid_init()
{
#ifdef REALM_COMPILER_SSE
Expand Down
Loading