Added a feature for ignoring some number tokens #74

Open
wants to merge 5 commits into master
99 changes: 69 additions & 30 deletions number_parser/parser.py
@@ -298,7 +298,7 @@ def parse_fraction(input_string, language=None):
return None


def parse(input_string, language=None):
def parse(input_string, language=None, ignore=None):
"""
Converts all the numbers in a sentence written in natural language to their numeric type while keeping
the other words unchanged. Returns the transformed string.
@@ -326,39 +326,78 @@ def _build_and_add_number(pop_last_space=False):
current_sentence.pop()

for token in tokens:
compare_token = _strip_accents(token.lower())
ordinal_number = _is_ordinal_token(compare_token, lang_data)

if not compare_token.strip():
if not tokens_taken:
if ignore:
if token in ignore:
_build_and_add_number()
current_sentence.append(token)
continue
else:
compare_token = _strip_accents(token.lower())
ordinal_number = _is_ordinal_token(compare_token, lang_data)

if not compare_token.strip():
if not tokens_taken:
current_sentence.append(token)
continue

if compare_token in SENTENCE_SEPARATORS:
_build_and_add_number(pop_last_space=True)
current_sentence.append(token)
final_sentence.extend(current_sentence)
current_sentence = []
continue

if ordinal_number:
tokens_taken.append(ordinal_number)
_build_and_add_number(pop_last_space=True)
elif (
_is_cardinal_token(compare_token, lang_data)
or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0)
):
tokens_taken.append(compare_token)
else:
if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data):
# when finishing with a skip_token --> keep it
skip_token = tokens_taken[-1]
tokens_taken.pop()
_build_and_add_number()
current_sentence.extend([skip_token, " "])

_build_and_add_number()
current_sentence.append(token)
else:
compare_token = _strip_accents(token.lower())
ordinal_number = _is_ordinal_token(compare_token, lang_data)

if compare_token in SENTENCE_SEPARATORS:
_build_and_add_number(pop_last_space=True)
current_sentence.append(token)
final_sentence.extend(current_sentence)
current_sentence = []
continue
if not compare_token.strip():
if not tokens_taken:
current_sentence.append(token)
continue

if ordinal_number:
tokens_taken.append(ordinal_number)
_build_and_add_number(pop_last_space=True)
elif (
_is_cardinal_token(compare_token, lang_data)
or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0)
):
tokens_taken.append(compare_token)
else:
if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data):
# when finishing with a skip_token --> keep it
skip_token = tokens_taken[-1]
tokens_taken.pop()
_build_and_add_number()
current_sentence.extend([skip_token, " "])
if compare_token in SENTENCE_SEPARATORS:
_build_and_add_number(pop_last_space=True)
current_sentence.append(token)
final_sentence.extend(current_sentence)
current_sentence = []
continue

if ordinal_number:
tokens_taken.append(ordinal_number)
_build_and_add_number(pop_last_space=True)
elif (
_is_cardinal_token(compare_token, lang_data)
or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0)
):
tokens_taken.append(compare_token)
else:
if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data):
# when finishing with a skip_token --> keep it
skip_token = tokens_taken[-1]
tokens_taken.pop()
_build_and_add_number()
current_sentence.extend([skip_token, " "])

_build_and_add_number()
current_sentence.append(token)
_build_and_add_number()
current_sentence.append(token)

_build_and_add_number()

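The new `ignore` argument lets callers name word tokens that the parser should leave untouched: when it reaches one of these tokens it flushes whatever number it has accumulated so far and copies the token through verbatim. Membership is checked against the raw token (`if token in ignore:`), so matching appears to be case-sensitive. A minimal usage sketch, assuming this branch is installed; the first call shows the library's existing behavior, and the expected outputs with `ignore` are taken from the test cases added below:

from number_parser import parse

# Without an ignore list the whole phrase is merged into one number.
parse("twenty one", language="en")                                    # -> "21"

# With "one" ignored, accumulation stops before the ignored token.
parse("twenty one", language="en", ignore=["one"])                    # -> "20 one"

# Ignored tokens also keep larger groupings from being combined.
parse("Two thousand sentences", language="en", ignore=["thousand"])   # -> "2 thousand sentences"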
29 changes: 29 additions & 0 deletions tests/test_number_parsing.py
@@ -121,6 +121,35 @@ def test_parse_sentences_ordinal(expected, test_input, lang):
assert parse(test_input, lang) == expected


@pytest.mark.parametrize(
"test_input,expected,lang,ignore",
[
('fifty fifth sixty seventh', "fifty 5 67", 'en', ['fifty','seven']),
('hundredth and one', "100 and 1", 'en',[]),
('one hundred and forty second', "140 second", 'en', ['second']),
('five thousandth and one', "5000 and one", 'en', ['one']),
# en
('Two thousand sentences', "2 thousand sentences", 'en', ['thousand']),
('twenty one', "20 one", 'en', ['one']),
('I have three apples and one pear.', "I have three apples and 1 pear.", 'en', ['three']),
# numeric
('eleven', "eleven", 'en', ['eleven']),
('ninety thirteen forty', "90 13 forty", 'en', ['forty']),
('one hundred and forty two', "one 140 two", 'en', ['one','two']),
('one hundred and one', "one 100 one", 'en', ['one']),
('seven thousand and nothing else',"seven 1000 and nothing else", 'en', ['seven']),
('five hundred sixty seven thousand twenty four', "five 167020 four", 'en', ['fifty','five','four']),
('one million four hundred twenty-three thousand nine hundred twenty-two', "1000400 twenty-3900 twenty-two", 'en', ['two','twenty']),
('nine hundred ninety-nine thousand nine hundred ninety-nine', "nine 190 nine 1000 nine 190 nine", 'en', ['nine']),
('one million fifty thousand', "1000000 fifty 1000", 'en', ['fifty']),
('two billion one hundred forty seven million four hundred eighty three thousand six hundred forty seven',
"two 1000000000 one 140 seven 1000483 thousand 640 seven", 'en', ['two','thousand','seven','one']),

]
)
def test_parse_including_ignore(expected, test_input, lang, ignore):
assert parse(test_input, lang, ignore) == expected

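One detail visible in the rows above: an empty ignore list (the 'hundredth and one' case) is falsy, so the `if ignore:` branch is skipped and the call behaves exactly like parsing without the new argument. A small illustration, with expected values taken from the parametrized cases above:

from number_parser import parse

# An empty list disables the feature entirely.
parse("hundredth and one", "en", [])                          # -> "100 and 1"

# A non-empty list flushes the running number at every ignored token.
parse("one hundred and forty two", "en", ["one", "two"])      # -> "one 140 two"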

@pytest.mark.parametrize(
"test_input,expected,lang",