Skip to content

Commit

Permalink
Canonicalize ICU locale names to language tags.
Browse files Browse the repository at this point in the history
Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open(), which ignores the
region, effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut
  • Loading branch information
jeff-davis committed Apr 4, 2023
1 parent d3d53f9 commit ea1db8a
Show file tree
Hide file tree
Showing 10 changed files with 258 additions and 27 deletions.
2 changes: 1 addition & 1 deletion doc/src/sgml/charset.sgml
Expand Up @@ -893,7 +893,7 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
The first example selects the ICU locale using a <quote>language
tag</quote> per BCP 47. The second example uses the traditional
ICU-specific locale syntax. The first style is preferred going
forward, but it is not supported by older ICU versions.
forward, and is used internally to store locales.
</para>
<para>
Note that you can name the collation objects in the SQL environment
Expand Down
46 changes: 25 additions & 21 deletions src/backend/commands/collationcmds.c
Expand Up @@ -165,6 +165,11 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
else
colliculocale = NULL;

/*
* When the ICU locale comes from an existing collation, do not
* canonicalize to a language tag.
*/

datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
if (!isnull)
collicurules = TextDatumGetCString(datum);
Expand Down Expand Up @@ -259,6 +264,25 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("parameter \"locale\" must be specified")));

/*
* During binary upgrade, preserve the locale string. Otherwise,
* canonicalize to a language tag.
*/
if (!IsBinaryUpgrade)
{
char *langtag = icu_language_tag(colliculocale,
icu_validation_level);

if (langtag && strcmp(colliculocale, langtag) != 0)
{
ereport(NOTICE,
(errmsg("using standard form \"%s\" for locale \"%s\"",
langtag, colliculocale)));

colliculocale = langtag;
}
}

icu_validate_locale(colliculocale);
}

Expand Down Expand Up @@ -569,26 +593,6 @@ cmpaliases(const void *a, const void *b)


#ifdef USE_ICU
/*
 * Get the ICU language tag for a locale name.
 *
 * localename: ICU locale name to convert.
 *
 * The result is a palloc'd string.  An ERROR is reported if the conversion
 * fails, or if the tag plus its terminating NUL does not fit in a buffer of
 * ULOC_FULLNAME_CAPACITY bytes.
 */
static char *
get_icu_language_tag(const char *localename)
{
	char		buf[ULOC_FULLNAME_CAPACITY];
	UErrorCode	status;
	int32_t		len;

	status = U_ZERO_ERROR;
	len = uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status);

	/*
	 * A tag that exactly fills buf is returned with a success status but
	 * without a terminating NUL, so it must never reach pstrdup().  Treat
	 * that case the same as an outright conversion failure.
	 */
	if (U_FAILURE(status) || len >= (int32_t) sizeof(buf))
		ereport(ERROR,
				(errmsg("could not convert locale name \"%s\" to language tag: %s",
						localename, u_errorName(status))));

	return pstrdup(buf);
}

/*
* Get a comment (specifically, the display name) for an ICU locale.
* The result is a palloc'd string, or NULL if we can't get a comment
Expand Down Expand Up @@ -950,7 +954,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
else
name = uloc_getAvailable(i);

langtag = get_icu_language_tag(name);
langtag = icu_language_tag(name, ERROR);

/*
* Be paranoid about not allowing any non-ASCII strings into
Expand Down
20 changes: 20 additions & 0 deletions src/backend/commands/dbcommands.c
Expand Up @@ -1058,6 +1058,26 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ICU locale must be specified")));

/*
* During binary upgrade, or when the locale came from the template
* database, preserve locale string. Otherwise, canonicalize to a
* language tag.
*/
if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
{
char *langtag = icu_language_tag(dbiculocale,
icu_validation_level);

if (langtag && strcmp(dbiculocale, langtag) != 0)
{
ereport(NOTICE,
(errmsg("using standard form \"%s\" for locale \"%s\"",
langtag, dbiculocale)));

dbiculocale = langtag;
}
}

icu_validate_locale(dbiculocale);
}
else
Expand Down
85 changes: 85 additions & 0 deletions src/backend/utils/adt/pg_locale.c
Expand Up @@ -2826,6 +2826,91 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,

#endif

/*
 * Return the BCP47 language tag representation of the requested locale.
 *
 * This function should be called before passing the string to ucol_open(),
 * because conversion to a language tag also performs "level 2
 * canonicalization". In addition to producing a consistent format, level 2
 * canonicalization is able to more accurately interpret different input
 * locale string formats, such as POSIX and .NET IDs.
 *
 * loc_str: the locale string to convert.
 * elevel: level handed to ereport() when the conversion fails; nothing is
 * reported if it is not positive.
 *
 * The result is a palloc'd string.  NULL is returned if the conversion
 * failed and control came back to this function (ereport() was skipped or
 * returned).
 */
char *
icu_language_tag(const char *loc_str, int elevel)
{
#ifdef USE_ICU
	UErrorCode	status;
	char		lang[ULOC_LANG_CAPACITY];
	char	   *langtag;
	size_t		buflen = 32;	/* arbitrary starting buffer size */
	const bool	strict = true;

	status = U_ZERO_ERROR;
	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);

	/*
	 * A language that exactly fills the buffer is flagged only with a
	 * warning and leaves lang without a terminating NUL.  Reject it along
	 * with real failures so the strcmp() calls below cannot overrun lang.
	 */
	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
	{
		if (elevel > 0)
			ereport(elevel,
					(errmsg("could not get language from locale \"%s\": %s",
							loc_str, u_errorName(status))));
		return NULL;
	}

	/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
		return pstrdup("en-US-u-va-posix");

	/*
	 * A BCP47 language tag doesn't have a clearly-defined upper limit
	 * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
	 * uloc_toLanguageTag() doesn't always return the ultimate length on the
	 * first call, necessitating a loop.
	 */
	langtag = palloc(buflen);
	while (true)
	{
		int32_t		len;
		bool		too_small;

		status = U_ZERO_ERROR;
		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);

		/*
		 * If the result fits in the buffer exactly (len == buflen),
		 * uloc_toLanguageTag() will return success without nul-terminating
		 * the result. Check for U_BUFFER_OVERFLOW_ERROR, the explicit
		 * not-terminated warning, or len >= buflen, and try again.
		 */
		too_small = (status == U_BUFFER_OVERFLOW_ERROR ||
					 status == U_STRING_NOT_TERMINATED_WARNING ||
					 (U_SUCCESS(status) && (size_t) len >= buflen));

		if (too_small && buflen < MaxAllocSize)
		{
			buflen = Min(buflen * 2, MaxAllocSize);
			langtag = repalloc(langtag, buflen);
			continue;
		}

		break;
	}

	if (U_FAILURE(status))
	{
		pfree(langtag);

		if (elevel > 0)
			ereport(elevel,
					(errmsg("could not convert locale name \"%s\" to language tag: %s",
							loc_str, u_errorName(status))));
		return NULL;
	}

	return langtag;
#else							/* not USE_ICU */
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("ICU is not supported in this build")));
	return NULL;				/* keep compiler quiet */
#endif							/* not USE_ICU */
}

/*
* Perform best-effort check that the locale is a valid one.
*/
Expand Down
81 changes: 81 additions & 0 deletions src/bin/initdb/initdb.c
Expand Up @@ -2229,6 +2229,78 @@ check_icu_locale_encoding(int user_enc)
return true;
}

/*
 * Convert to canonical BCP47 language tag. Must be consistent with
 * icu_language_tag().
 *
 * loc_str: the ICU locale string to convert.
 *
 * Returns a newly allocated string holding the language tag.  Any failure
 * is reported through pg_fatal(), so there is no error return value.
 */
static char *
icu_language_tag(const char *loc_str)
{
#ifdef USE_ICU
	UErrorCode	status;
	char		lang[ULOC_LANG_CAPACITY];
	char	   *langtag;
	size_t		buflen = 32;	/* arbitrary starting buffer size */
	const bool	strict = true;

	status = U_ZERO_ERROR;
	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);

	/*
	 * A language that exactly fills the buffer is flagged only with a
	 * warning and leaves lang without a terminating NUL.  Reject it along
	 * with real failures so the strcmp() calls below cannot overrun lang.
	 */
	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
		pg_fatal("could not get language from locale \"%s\": %s",
				 loc_str, u_errorName(status));

	/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
		return pstrdup("en-US-u-va-posix");

	/*
	 * A BCP47 language tag doesn't have a clearly-defined upper limit
	 * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
	 * uloc_toLanguageTag() doesn't always return the ultimate length on the
	 * first call, necessitating a loop.
	 */
	langtag = pg_malloc(buflen);
	while (true)
	{
		int32_t		len;

		status = U_ZERO_ERROR;
		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);

		/*
		 * If the result fits in the buffer exactly (len == buflen),
		 * uloc_toLanguageTag() will return success without nul-terminating
		 * the result. Check for U_BUFFER_OVERFLOW_ERROR, the explicit
		 * not-terminated warning, or len >= buflen, and try again.
		 */
		if (status == U_BUFFER_OVERFLOW_ERROR ||
			status == U_STRING_NOT_TERMINATED_WARNING ||
			(U_SUCCESS(status) && (size_t) len >= buflen))
		{
			buflen = buflen * 2;
			langtag = pg_realloc(langtag, buflen);
			continue;
		}

		break;
	}

	if (U_FAILURE(status))
	{
		pg_free(langtag);

		pg_fatal("could not convert locale name \"%s\" to language tag: %s",
				 loc_str, u_errorName(status));
	}

	return langtag;
#else
	pg_fatal("ICU is not supported in this build");
#endif
}

/*
* Perform best-effort check that the locale is a valid one. Should be
* consistent with pg_locale.c, except that it doesn't need to open the
Expand Down Expand Up @@ -2376,13 +2448,22 @@ setlocales(void)

if (locale_provider == COLLPROVIDER_ICU)
{
char *langtag;

/* acquire default locale from the environment, if not specified */
if (icu_locale == NULL)
{
icu_locale = default_icu_locale();
printf(_("Using default ICU locale \"%s\".\n"), icu_locale);
}

/* canonicalize to a language tag */
langtag = icu_language_tag(icu_locale);
printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
langtag, icu_locale);
pg_free(icu_locale);
icu_locale = langtag;

icu_validate_locale(icu_locale);

/*
Expand Down
2 changes: 1 addition & 1 deletion src/bin/initdb/t/001_initdb.pl
Expand Up @@ -144,7 +144,7 @@
'--locale-provider=icu',
'--icu-locale=@colNumeric=lower', "$tempdir/dataX"
],
qr/could not open collator for locale "\@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR/,
qr/could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR/,
'fails for invalid collation argument');
}
else
Expand Down
4 changes: 2 additions & 2 deletions src/bin/pg_dump/t/002_pg_dump.pl
Expand Up @@ -1860,9 +1860,9 @@

'CREATE COLLATION icu_collation' => {
create_order => 76,
create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'C');",
create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'en-US-u-va-posix');",
regexp =>
qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'C'(, version = '[^']*')?\);/m,
qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'en-US-u-va-posix'(, version = '[^']*')?\);/m,
icu => 1,
like => { %full_runs, section_pre_data => 1, },
},
Expand Down
1 change: 1 addition & 0 deletions src/include/utils/pg_locale.h
Expand Up @@ -120,6 +120,7 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
size_t srclen, pg_locale_t locale);

extern void icu_validate_locale(const char *loc_str);
extern char *icu_language_tag(const char *loc_str, int elevel);

#ifdef USE_ICU
extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
Expand Down

0 comments on commit ea1db8a

Please sign in to comment.