diff --git a/number_parser/parser.py b/number_parser/parser.py
index e0d67c2..13739d7 100644
--- a/number_parser/parser.py
+++ b/number_parser/parser.py
@@ -298,7 +298,8 @@ def parse_fraction(input_string, language=None):
     return None
 
 
-def parse(input_string, language=None):
+def parse(input_string, language=None, ignore=None):
     """
     Converts all the numbers in a sentence written in natural language to their numeric type while keeping
     the other words unchanged. Returns the transformed string.
+    Tokens contained in the optional ``ignore`` collection are emitted unchanged and terminate any number phrase.
@@ -326,6 +326,15 @@ def _build_and_add_number(pop_last_space=False):
             current_sentence.pop()
 
     for token in tokens:
+        # Pass-through for explicitly ignored tokens: flush any number built
+        # so far (so it is not merged across the ignored word), then keep the
+        # token verbatim and skip all number matching for it.
+        # NOTE(review): membership uses the raw token, so it is case- and
+        # accent-sensitive, unlike the normalized matching below — confirm intended.
+        if ignore and token in ignore:
+            _build_and_add_number()
+            current_sentence.append(token)
+            continue
+
         compare_token = _strip_accents(token.lower())
         ordinal_number = _is_ordinal_token(compare_token, lang_data)
 
diff --git a/tests/test_number_parsing.py b/tests/test_number_parsing.py
index 2de6d09..965a3e7 100644
--- a/tests/test_number_parsing.py
+++ b/tests/test_number_parsing.py
@@ -121,5 +121,35 @@ def test_parse_sentences_ordinal(expected, test_input, lang):
     assert parse(test_input, lang) == expected
 
 
+@pytest.mark.parametrize(
+    "test_input,expected,lang,ignore",
+    [
+        ('fifty fifth sixty seventh', "fifty 5 67", 'en', ['fifty','seven']),
+        ('hundredth and one', "100 and 1", 'en',[]),
+        ('one hundred and forty second', "140 second", 'en', ['second']),
+        ('five thousandth and one', "5000 and one", 'en', ['one']),
+        # en
+        ('Two thousand sentences', "2 thousand sentences", 'en', ['thousand']),
+        ('twenty one', "20 one", 'en', ['one']),
+        ('I have three apples and one pear.', "I have three apples and 1 pear.", 'en', ['three']),
+        # numeric
+        ('eleven', "eleven", 'en', ['eleven']),
+        ('ninety thirteen forty', "90 13 forty", 'en', ['forty']),
+        ('one hundred and forty two', "one 140 two", 'en', ['one','two']),
+        ('one hundred and one', "one 100 one", 'en', ['one']),
+        ('seven thousand and nothing else',"seven 1000 and nothing else", 'en', ['seven']),
+        ('five hundred sixty seven thousand twenty four', "five 167020 four", 'en', ['fifty','five','four']),
+        ('one million four hundred twenty-three thousand nine hundred twenty-two', "1000400 twenty-3900 twenty-two", 'en', ['two','twenty']),
+        ('nine hundred ninety-nine thousand nine hundred ninety-nine', "nine 190 nine 1000 nine 190 nine", 'en', ['nine']),
+        ('one million fifty thousand', "1000000 fifty 1000", 'en', ['fifty']),
+        ('two billion one hundred forty seven million four hundred eighty three thousand six hundred forty seven',
+         "two 1000000000 one 140 seven 1000483 thousand 640 seven", 'en', ['two','thousand','seven','one']),
+
+    ]
+)
+def test_parse_including_ignore(expected, test_input, lang, ignore):
+    assert parse(test_input, lang, ignore) == expected
+
+
 @pytest.mark.parametrize(
     "test_input,expected,lang",