Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLDR script update #487

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions scripts/order_languages.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
# -*- coding: utf-8 -*-
import regex as re
import json
import os
from collections import OrderedDict

from utils import get_raw_data
import regex as re

from utils import AVOID_LANGUAGES, get_raw_data

os.chdir(os.path.dirname(os.path.abspath(__file__)))
get_raw_data()

# Languages with insufficient translation data are excluded
avoid_languages = ['cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo']


def _get_language_locale_dict():
cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"
Expand All @@ -26,7 +24,7 @@ def _get_language_locale_dict():
if re.match(language_name + '-[A-Z0-9]+$', locale_name):
language_locale_dict[language_name].append(locale_name)

for language in avoid_languages:
for language in AVOID_LANGUAGES:
del language_locale_dict[language]
return language_locale_dict

Expand Down
33 changes: 30 additions & 3 deletions scripts/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# -*- coding: utf-8 -*-
import os
from collections import OrderedDict

from git import Repo
import os

# Languages with insufficient translation data are excluded
# TODO: Automate with exclusion criteria.
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
AVOID_LANGUAGES = {'cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo'}


def get_raw_data():
Expand All @@ -27,18 +32,26 @@ def get_dict_difference(parent_dict, child_dict):
if not parent_value:
child_specific_value = child_value
elif isinstance(child_value, list):
child_specific_value = list(set(child_value)-set(parent_value))
child_specific_value = list(
set(map(str.lower, map(str, child_value))) -
set(map(str.lower, map(str, parent_value)))
)
elif isinstance(child_value, dict):
child_specific_value = get_dict_difference(parent_value, child_value)
elif child_value != parent_value:
elif child_value.lower() != parent_value.lower():
child_specific_value = child_value
if child_specific_value:
difference_dict[key] = child_specific_value
return difference_dict


def combine_dicts(primary_dict, supplementary_dict):
if not primary_dict:
return supplementary_dict
elif not supplementary_dict:
return primary_dict
Comment on lines +49 to +52
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if not primary_dict:
return supplementary_dict
elif not supplementary_dict:
return primary_dict
if not primary_dict or not supplementary_dict:
return primary_dict or supplementary_dict

This is simpler, but I'm not sure it is as readable 🤔

combined_dict = OrderedDict()
filter_locales(primary_dict, supplementary_dict)
for key, value in primary_dict.items():
if key in supplementary_dict:
if isinstance(value, list):
Expand All @@ -53,3 +66,17 @@ def combine_dicts(primary_dict, supplementary_dict):
for key in remaining_keys:
combined_dict[key] = supplementary_dict[key]
return combined_dict


def filter_locales(primary_dict, supplementary_dict):
    """Strip from each locale entry whatever ``supplementary_dict`` already has.

    Every value under ``primary_dict['locale_specific']`` is replaced, in
    place, by ``get_dict_difference(supplementary_dict, locale_data)``.
    A locale whose remaining difference consists of nothing but a
    ``'name'`` key is removed altogether.

    :param primary_dict: language data; mutated in place. Nothing happens
        when it has no (or an empty) ``'locale_specific'`` entry.
    :param supplementary_dict: data passed as the parent dictionary to
        ``get_dict_difference``.
    :return: None
    """
    locale_specific = primary_dict.get('locale_specific')
    if not locale_specific:
        return
    # Iterate over a snapshot: entries may be deleted inside the loop, and
    # changing a dict's size while iterating its live view raises
    # RuntimeError on Python 3.
    for locale, locale_data in list(locale_specific.items()):
        diff = get_dict_difference(supplementary_dict, locale_data)
        # Compare the key *set*: on Python 3 ``diff.keys() == ['name']`` is
        # always False because a keys view never equals a list.
        if set(diff) == {'name'}:
            del locale_specific[locale]
        else:
            locale_specific[locale] = diff


def language_from_filename(filename):
    """Return the portion of ``filename`` that precedes its first dot.

    A name without any dot is returned unchanged.
    """
    stem, _sep, _rest = filename.partition('.')
    return stem
36 changes: 19 additions & 17 deletions scripts/write_complete_data.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# -*- coding: utf-8 -*-
import json
from ruamel.yaml import RoundTripLoader
import logging
import os
import shutil
from collections import OrderedDict

import regex as re
from ruamel.yaml import RoundTripLoader

from utils import combine_dicts
from utils import AVOID_LANGUAGES, combine_dicts, language_from_filename

cldr_date_directory = '../dateparser_data/cldr_language_data/date_translation_data/'
cldr_numeral_directory = '../dateparser_data/cldr_language_data/numeral_translation_data/'
Expand All @@ -17,18 +19,20 @@
numeral_translation_directory = '../dateparser/data/numeral_translation_data/'

os.chdir(os.path.dirname(os.path.abspath(__file__)))
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
log = logging.getLogger('data_scripts')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why did you add the log here?


# Languages with insufficient translation data are excluded
# TODO: Automate with exclusion criteria.
avoid_languages = {'cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo'}

cldr_languages = list(set(map(lambda x: x[:-5], os.listdir(cldr_date_directory))) - avoid_languages)
supplementary_languages = list(map(lambda x: x[:-5], os.listdir(supplementary_date_directory)))
all_languages = set(cldr_languages).union(set(supplementary_languages))
cldr_languages = set([language_from_filename(filename) for filename
in os.listdir(cldr_date_directory)]) - AVOID_LANGUAGES
supplementary_languages = set([language_from_filename(filename) for filename
in os.listdir(supplementary_date_directory)])
all_languages = cldr_languages.union(supplementary_languages)

cldr_numeral_languages = list(map(lambda x: x[:-5], os.listdir(cldr_numeral_directory)))
cldr_numeral_languages = [language_from_filename(filename) for filename
in os.listdir(cldr_numeral_directory)]

RELATIVE_PATTERN = re.compile(r'\{0\}')
encoding_comment = "# -*- coding: utf-8 -*-\n"


def _modify_relative_data(relative_data):
Expand All @@ -38,16 +42,15 @@ def _modify_relative_data(relative_data):
string = RELATIVE_PATTERN.sub(r'(\\d+)', string)
value[i] = string
modified_relative_data[key] = value
return modified_relative_data
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this function is no longer going to return the `modified_relative_data` dict, we don't need to create it at all, because nothing keeps track of it.

Suggestion: delete `modified_relative_data = OrderedDict()` and `modified_relative_data[key] = value`.



def _modify_data(language_data):
relative_data = language_data.get("relative-type-regex", {})
relative_data = _modify_relative_data(relative_data)
_modify_relative_data(relative_data)
locale_specific_data = language_data.get("locale_specific", {})
for _, info in locale_specific_data.items():
locale_relative_data = info.get("relative-type-regex", {})
locale_relative_data = _modify_relative_data(locale_relative_data)
_modify_relative_data(locale_relative_data)


def _get_complete_date_translation_data(language):
Expand All @@ -66,7 +69,6 @@ def _get_complete_date_translation_data(language):


def main():
encoding_comment = "# -*- coding: utf-8 -*-\n"
if not os.path.isdir(translation_data_directory):
os.mkdir(translation_data_directory)
if os.path.isdir(date_translation_directory):
Expand Down Expand Up @@ -98,9 +100,9 @@ def main():
out.write(out_text)

init_text = '\n'.join(
["from dateparser.data import date_translation_data, numeral_translation_data",
"from .languages_info import language_order, language_locale_dict"]
)
["from dateparser.data import date_translation_data, numeral_translation_data",
"from .languages_info import language_order, language_locale_dict"]
)
with open(translation_data_directory + '__init__.py', 'w') as out:
out.write(encoding_comment + init_text)

Expand Down