Skip to content

Commit

Permalink
Use an alias for se / sme, as per #1279
Browse files Browse the repository at this point in the history
For any language whose default code is the 3-letter one (as used by Universal Dependencies) but which also has a 2-letter code, we can add the 2-letter code to the resources file as an alias for people who expect it.

Currently this applies only to se / sme (as far as we know).
  • Loading branch information
AngledLuffa committed Sep 10, 2023
1 parent 5b3c8b3 commit 88cd0df
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
29 changes: 23 additions & 6 deletions stanza/models/common/constant.py
Expand Up @@ -200,7 +200,6 @@
("frr", "North_Frisian"),
("nd", "North_Ndebele"),
("sme", "North_Sami"),
("se", "Northern_Sami"),
("nso", "Northern_Sotho"),
("nb", "Norwegian_Bokmaal"),
("nn", "Norwegian_Nynorsk"),
Expand Down Expand Up @@ -346,20 +345,38 @@
("xh", "xho"),
("yo", "yor"),
("zu", "zul"),

# this is a weird case where a 2 letter code was available,
# but UD used the 3 letter code instead
("se", "sme"),
)

# Walk every (2-letter, 3-letter) pair in two_to_three_letters_raw and register
# the code that is not yet in lcode2lang as an alias of the one that is.
for two, three in two_to_three_letters_raw:
# NOTE(review): the next five lines look like the pre-commit loop body that
# this diff removes (the page shows old and new lines without +/- markers,
# and the file header reports 6 deletions) -- confirm before relying on them.
assert two in lcode2lang
assert three not in lcode2lang
assert three not in lang2lcode
lang2lcode[three] = two
lcode2lang[three] = lcode2lang[two]
# The 2-letter code is already known: the 3-letter code must be new, and it
# is mapped to the 2-letter code and to the same language name.
if two in lcode2lang:
assert two in lcode2lang
assert three not in lcode2lang
assert three not in lang2lcode
lang2lcode[three] = two
lcode2lang[three] = lcode2lang[two]
# Only the 3-letter code is already known (the se / sme case): the 2-letter
# code must be new, and it is mapped to the 3-letter code and its language name.
elif three in lcode2lang:
assert three in lcode2lang
assert two not in lcode2lang
assert two not in lang2lcode
lang2lcode[two] = three
lcode2lang[two] = lcode2lang[three]
# Neither code is known, so there is nothing to alias to; fail loudly.
else:
raise AssertionError("Found a proposed alias %s -> %s when neither code was already known" % (two, three))

# Forward lookup: 2-letter code -> 3-letter code.
two_to_three_letters = dict(two_to_three_letters_raw)

# Reverse lookup: 3-letter code -> 2-letter code.
three_to_two_letters = {
    long_code: short_code for short_code, long_code in two_to_three_letters_raw
}

# A code repeated on either side would collapse two pairs into one dict entry,
# so both lookups must be exactly as large as the raw pair list.
assert len(two_to_three_letters) == len(three_to_two_letters) == len(two_to_three_letters_raw)

# additional useful code to language mapping
# added after dict invert to avoid conflict
Expand Down
4 changes: 3 additions & 1 deletion stanza/resources/prepare_resources.py
Expand Up @@ -18,7 +18,7 @@
import zipfile

from stanza import __resources_version__
from stanza.models.common.constant import lcode2lang, two_to_three_letters
from stanza.models.common.constant import lcode2lang, two_to_three_letters, three_to_two_letters
from stanza.resources.default_packages import default_treebanks, no_pretrain_languages, default_pretrains, pos_pretrains, depparse_pretrains, ner_pretrains, default_charlms, pos_charlms, depparse_charlms, ner_charlms, lemma_charlms, known_nicknames

def parse_args():
Expand Down Expand Up @@ -482,6 +482,8 @@ def process_lcode(args):
resources_new[lang_name.lower()] = {'alias': lang.lower()}
if lang.lower() in two_to_three_letters:
resources_new[two_to_three_letters[lang.lower()]] = {'alias': lang.lower()}
elif lang.lower() in three_to_two_letters:
resources_new[three_to_two_letters[lang.lower()]] = {'alias': lang.lower()}
print("Processed lcode aliases. Writing resources.json")
json.dump(resources_new, open(os.path.join(args.output_dir, 'resources.json'), 'w'), indent=2)

Expand Down

0 comments on commit 88cd0df

Please sign in to comment.