Skip to content

Commit

Permalink
ICU-21184 rephrase docs/comments using the term grandfathered
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Aug 21, 2020
1 parent cde54fc commit 39da689
Show file tree
Hide file tree
Showing 18 changed files with 138 additions and 287 deletions.
11 changes: 6 additions & 5 deletions icu4c/source/common/locid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1025,13 +1025,14 @@ Locale::forLanguageTag(StringPiece tag, UErrorCode& status)
return result;
}

// If a BCP-47 language tag is passed as the language parameter to the
// If a BCP 47 language tag is passed as the language parameter to the
// normal Locale constructor, it will actually fall back to invoking
// uloc_forLanguageTag() to parse it if it somehow is able to detect that
// the string actually is BCP-47. This works well for things like strings
// using BCP-47 extensions, but it does not at all work for things like
// BCP-47 grandfathered tags (eg. "en-GB-oed") which are possible to also
// interpret as ICU locale IDs and because of that won't trigger the BCP-47
// the string actually is BCP 47. This works well for things like strings
// using BCP 47 extensions, but it does not at all work for things like
// legacy language tags (marked as “Type: grandfathered” in BCP 47,
// e.g., "en-GB-oed") which are possible to also
// interpret as ICU locale IDs and because of that won't trigger the BCP 47
// parsing. Therefore the code here explicitly calls uloc_forLanguageTag()
// and then Locale::init(), instead of just calling the normal constructor.

Expand Down
53 changes: 27 additions & 26 deletions icu4c/source/common/uloc_tag.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ typedef struct ULanguageTag {
VariantListEntry *variants;
ExtensionListEntry *extensions;
const char *privateuse;
const char *grandfathered;
const char *legacy;
} ULanguageTag;

#define MINLEN 2
Expand Down Expand Up @@ -85,8 +85,9 @@ static const char LOCALE_TYPE_YES[] = "yes";
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
This table has 2 parts. The parts for Grandfathered tags is generated by the
following scripts from the IANA language tag registry.
This table has 2 parts. The part for
legacy language tags (marked as “Type: grandfathered” in BCP 47)
is generated by the following scripts from the IANA language tag registry.
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
egrep -A 7 'Type: grandfathered' | \
Expand All @@ -100,8 +101,8 @@ static const char LOCALE_TYPE_YES[] = "yes";
values. They may have to be removed for strict BCP 47 compliance.
*/
static const char* const GRANDFATHERED[] = {
/* grandfathered preferred */
static const char* const LEGACY[] = {
/* legacy preferred */
"art-lojban", "jbo",
"en-gb-oed", "en-gb-oxendict",
"i-ami", "ami",
Expand All @@ -124,7 +125,7 @@ static const char* const GRANDFATHERED[] = {
"zh-min-nan", "nan",
"zh-xiang", "hsn",

// Grandfathered tags with no preferred value in the IANA
// Legacy tags with no preferred value in the IANA
// registry. Kept for now for backward compatibility
// because ICU has mapped them this way.
"cel-gaulish", "xtg-x-cel-gaulish",
Expand Down Expand Up @@ -346,7 +347,7 @@ ultag_getPrivateUse(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag);
ultag_getLegacy(const ULanguageTag* langtag);
#endif

U_NAMESPACE_BEGIN
Expand Down Expand Up @@ -986,7 +987,7 @@ _initializeULanguageTag(ULanguageTag* langtag) {
langtag->variants = NULL;
langtag->extensions = NULL;

langtag->grandfathered = EMPTY;
langtag->legacy = EMPTY;
langtag->privateuse = EMPTY;
}

Expand Down Expand Up @@ -2042,7 +2043,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
char *pExtValueSubtag, *pExtValueSubtagEnd;
int32_t i;
UBool privateuseVar = FALSE;
int32_t grandfatheredLen = 0;
int32_t legacyLen = 0;

if (parsedLen != NULL) {
*parsedLen = 0;
Expand Down Expand Up @@ -2082,25 +2083,25 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}

size_t parsedLenDelta = 0;
// Grandfathered tag will be consider together. Grandfathered tag with intervening
// Legacy tags will be considered together. A legacy tag with intervening
// script and region such as art-DE-lojban or art-Latn-lojban won't be
// matched.
/* check if the tag is grandfathered */
for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
int32_t checkGrandfatheredLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i]));
if (tagLen < checkGrandfatheredLen) {
/* check if the tag is legacy */
for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
if (tagLen < checkLegacyLen) {
continue;
}
if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') {
if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
// make sure next char is '-'.
continue;
}
if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) {
if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
int32_t newTagLength;

grandfatheredLen = checkGrandfatheredLen; /* back up for output parsedLen */
int32_t replacementLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1]));
newTagLength = replacementLen + tagLen - checkGrandfatheredLen;
legacyLen = checkLegacyLen; /* back up for output parsedLen */
int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
newTagLength = replacementLen + tagLen - checkLegacyLen;
if (tagLen < newTagLength) {
uprv_free(tagBuf);
tagBuf = (char*)uprv_malloc(newTagLength + 1);
Expand All @@ -2111,16 +2112,16 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
t->buf = tagBuf;
tagLen = newTagLength;
}
parsedLenDelta = checkGrandfatheredLen - replacementLen;
uprv_strcpy(t->buf, GRANDFATHERED[i + 1]);
if (checkGrandfatheredLen != tagLen) {
uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen);
parsedLenDelta = checkLegacyLen - replacementLen;
uprv_strcpy(t->buf, LEGACY[i + 1]);
if (checkLegacyLen != tagLen) {
uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen);
}
break;
}
}

if (grandfatheredLen == 0) {
if (legacyLen == 0) {
for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
const char* redundantTag = REDUNDANT[i];
size_t redundantTagLen = uprv_strlen(redundantTag);
Expand Down Expand Up @@ -2608,8 +2609,8 @@ ultag_getPrivateUse(const ULanguageTag* langtag) {

#if 0
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag) {
return langtag->grandfathered;
ultag_getLegacy(const ULanguageTag* langtag) {
return langtag->legacy;
}
#endif

Expand Down
16 changes: 10 additions & 6 deletions icu4c/source/common/ulocimp.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,17 @@ ulocimp_toLanguageTag(const char* localeID,
* If the specified language tag contains any ill-formed subtags,
* the first such subtag and all following subtags are ignored.
* <p>
* This implements the 'Language-Tag' production of BCP47, and so
* supports grandfathered (regular and irregular) as well as private
* use language tags. Private use tags are represented as 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist. Note that a few grandfathered tags have no modern
* replacement, these will be converted using the fallback described in
* This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* Private use tags are represented as 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* @param langtag the input BCP47 language tag.
* @param tagLen the length of langtag, or -1 to call uprv_strlen().
* @param sink the output sink receiving a locale ID for the
Expand Down
11 changes: 6 additions & 5 deletions icu4c/source/common/unicode/localebuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,12 @@ class U_COMMON_API LocaleBuilder : public UObject {
/**
* Resets the LocaleBuilder to match the provided
* [Unicode Locale Identifier](http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_id) .
* Discards the existing state. the empty string cause the builder to be
* reset, like {@link #clear}. Grandfathered tags are converted to their
* canonical form before being processed. Otherwise, the <code>language
* tag</code> must be well-formed, or else the build() method will later
* report an U_ILLEGAL_ARGUMENT_ERROR.
* Discards the existing state.
* The empty string causes the builder to be reset, like {@link #clear}.
* Legacy language tags (marked as “Type: grandfathered” in BCP 47)
* are converted to their canonical form before being processed.
* Otherwise, the <code>language tag</code> must be well-formed,
* or else the build() method will later report a U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>This method clears the internal UErrorCode.
*
Expand Down
16 changes: 10 additions & 6 deletions icu4c/source/common/unicode/locid.h
Original file line number Diff line number Diff line change
Expand Up @@ -393,13 +393,17 @@ class U_COMMON_API Locale : public UObject {
* If the specified language tag contains any ill-formed subtags,
* the first such subtag and all following subtags are ignored.
* <p>
* This implements the 'Language-Tag' production of BCP47, and so
* supports grandfathered (regular and irregular) as well as private
* use language tags. Private use tags are represented as 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist. Note that a few grandfathered tags have no modern
* replacement, these will be converted using the fallback described in
* This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* Private use tags are represented as 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* @param tag the input BCP47 language tag.
* @param status error information if creating the Locale failed.
* @return the Locale for the specified BCP47 language tag.
Expand Down
18 changes: 11 additions & 7 deletions icu4c/source/common/unicode/uloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -1237,14 +1237,18 @@ uloc_minimizeSubtags(const char* localeID,
* Returns a locale ID for the specified BCP47 language tag string.
* If the specified language tag contains any ill-formed subtags,
* the first such subtag and all following subtags are ignored.
* <p>
* This implements the 'Language-Tag' production of BCP47, and so
* supports grandfathered (regular and irregular) as well as private
* use language tags. Private use tags are represented as 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist. Note that a few grandfathered tags have no modern
* replacement, these will be converted using the fallback described in
* <p>
* This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* Private use tags are represented as 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* @param langtag the input BCP47 language tag.
* @param localeID the output buffer receiving a locale ID for the
* specified BCP47 language tag.
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/i18n/calendar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ static ECalType getCalendarTypeForLocale(const char *locid) {
//TODO: ULOC_FULL_NAME is out of date and too small..
char canonicalName[256];

// canonicalize, so grandfathered variant will be transformed to keywords
// Canonicalize, so that an old-style variant will be transformed to keywords.
// e.g., ja_JP_TRADITIONAL -> ja_JP@calendar=japanese
// NOTE: Since ICU-20187, ja_JP_TRADITIONAL no longer canonicalizes, and
// the Gregorian calendar is returned instead.
Expand Down
2 changes: 0 additions & 2 deletions icu4c/source/i18n/fmtable_cnv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ U_NAMESPACE_BEGIN
// -------------------------------------
// Creates a formattable object with a char* string.
// This API is useless. The API that takes a UnicodeString is actually just as good.
// This is just a grandfathered API.

Formattable::Formattable(const char* stringToCopy)
{
init();
Expand Down
6 changes: 3 additions & 3 deletions icu4c/source/test/testdata/localeMatcherTest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ und-TW >> zh-Hant
zh-Hant >> und-TW
zh >> und-TW

** test: testMatchGrandfatheredCode
** test: testMatchLegacyCode

@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
Expand Down Expand Up @@ -984,7 +984,7 @@ x-bork >> x-bork
x-piglatin >> x-bork
x-bork >> x-bork

** test: MatchGrandfatheredCode
** test: MatchLegacyCode
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
i-klingon >> tlh
Expand Down Expand Up @@ -1525,7 +1525,7 @@ en >> null
x-piglatin >> fr
x-bork >> x-bork

** test: grandfathered codes
** test: legacy codes
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
i-klingon >> tlh
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ public class LanguageTag {
private List<String> _variants = Collections.emptyList(); // variant subtags
private List<String> _extensions = Collections.emptyList(); // extensions

// Map contains grandfathered tags and its preferred mappings from
// http://www.ietf.org/rfc/rfc5646.txt
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
// The Map contains legacy language tags (marked as “Type: grandfathered” in BCP 47)
// and their preferred mappings from BCP 47.
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> LEGACY =
new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();

static {
// grandfathered = irregular ; non-redundant tags registered
// legacy = irregular ; non-redundant tags registered
// / regular ; during the RFC 3066 era
//
// irregular = "en-GB-oed" ; irregular tags do not match
Expand Down Expand Up @@ -105,57 +105,17 @@ public class LanguageTag {
{"zh-xiang", "hsn"},
};
for (String[] e : entries) {
GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
}
}

private LanguageTag() {
}

/*
* BNF in RFC 5646
*
* Language-Tag = langtag ; normal language tags
* / privateuse ; private use tag
* / grandfathered ; grandfathered tags
*
*
* langtag = language
* ["-" script]
* ["-" region]
* *("-" variant)
* *("-" extension)
* ["-" privateuse]
*
* language = 2*3ALPHA ; shortest ISO 639 code
* ["-" extlang] ; sometimes followed by
* ; extended language subtags
* / 4ALPHA ; or reserved for future use
* / 5*8ALPHA ; or registered language subtag
*
* extlang = 3ALPHA ; selected ISO 639 codes
* *2("-" 3ALPHA) ; permanently reserved
*
* script = 4ALPHA ; ISO 15924 code
*
* region = 2ALPHA ; ISO 3166-1 code
* / 3DIGIT ; UN M.49 code
*
* variant = 5*8alphanum ; registered variants
* / (DIGIT 3alphanum)
*
* extension = singleton 1*("-" (2*8alphanum))
*
* ; Single alphanumerics
* ; "x" reserved for private use
* singleton = DIGIT ; 0 - 9
* / %x41-57 ; A - W
* / %x59-5A ; Y - Z
* / %x61-77 ; a - w
* / %x79-7A ; y - z
*
* privateuse = "x" 1*("-" (1*8alphanum))
*
/**
* See BCP 47 “Tags for Identifying Languages”:
* https://www.rfc-editor.org/info/bcp47 -->
* https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1
*/
public static LanguageTag parse(String languageTag, ParseStatus sts) {
if (sts == null) {
Expand All @@ -166,8 +126,7 @@ public static LanguageTag parse(String languageTag, ParseStatus sts) {

StringTokenIterator itr;

// Check if the tag is grandfathered
String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
if (gfmap != null) {
// use preferred mapping
itr = new StringTokenIterator(gfmap[1], SEP);
Expand Down
Loading

0 comments on commit 39da689

Please sign in to comment.