Added a feature for ignoring some number tokens #74

Open
wants to merge 5 commits into master
99 changes: 69 additions & 30 deletions number_parser/parser.py
@@ -298,7 +298,7 @@ def parse_fraction(input_string, language=None):
return None


def parse(input_string, language=None):
def parse(input_string, language=None, ignore=None):
"""
Converts all the numbers in a sentence written in natural language to their numeric type while keeping
the other words unchanged. Returns the transformed string.
@@ -326,39 +326,78 @@ def _build_and_add_number(pop_last_space=False):
current_sentence.pop()

for token in tokens:
compare_token = _strip_accents(token.lower())
ordinal_number = _is_ordinal_token(compare_token, lang_data)

if not compare_token.strip():
if not tokens_taken:
if ignore:
if token in ignore:
_build_and_add_number()
current_sentence.append(token)
continue
else:
compare_token = _strip_accents(token.lower())
ordinal_number = _is_ordinal_token(compare_token, lang_data)

if not compare_token.strip():
if not tokens_taken:
current_sentence.append(token)
continue

if compare_token in SENTENCE_SEPARATORS:
_build_and_add_number(pop_last_space=True)
current_sentence.append(token)
final_sentence.extend(current_sentence)
current_sentence = []
continue

if ordinal_number:
tokens_taken.append(ordinal_number)
_build_and_add_number(pop_last_space=True)
elif (
_is_cardinal_token(compare_token, lang_data)
or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0)
):
tokens_taken.append(compare_token)
else:
if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data):
# when finishing with a skip_token --> keep it
skip_token = tokens_taken[-1]
tokens_taken.pop()
_build_and_add_number()
current_sentence.extend([skip_token, " "])

_build_and_add_number()
current_sentence.append(token)
else:
compare_token = _strip_accents(token.lower())
ordinal_number = _is_ordinal_token(compare_token, lang_data)

if compare_token in SENTENCE_SEPARATORS:
_build_and_add_number(pop_last_space=True)
current_sentence.append(token)
final_sentence.extend(current_sentence)
current_sentence = []
continue
if not compare_token.strip():
if not tokens_taken:
current_sentence.append(token)
continue

if ordinal_number:
tokens_taken.append(ordinal_number)
_build_and_add_number(pop_last_space=True)
elif (
_is_cardinal_token(compare_token, lang_data)
or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0)
):
tokens_taken.append(compare_token)
else:
if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data):
# when finishing with a skip_token --> keep it
skip_token = tokens_taken[-1]
tokens_taken.pop()
_build_and_add_number()
current_sentence.extend([skip_token, " "])
if compare_token in SENTENCE_SEPARATORS:
_build_and_add_number(pop_last_space=True)
current_sentence.append(token)
final_sentence.extend(current_sentence)
current_sentence = []
continue

if ordinal_number:
tokens_taken.append(ordinal_number)
_build_and_add_number(pop_last_space=True)
elif (
_is_cardinal_token(compare_token, lang_data)
or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0)
):
tokens_taken.append(compare_token)
else:
if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data):
# when finishing with a skip_token --> keep it
skip_token = tokens_taken[-1]
tokens_taken.pop()
_build_and_add_number()
current_sentence.extend([skip_token, " "])

_build_and_add_number()
current_sentence.append(token)
_build_and_add_number()
current_sentence.append(token)

_build_and_add_number()

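The new `ignore` argument lets callers name word tokens that the parser should leave untouched: when it reaches one of these tokens it flushes whatever number it has accumulated so far and copies the token through verbatim. Membership is checked against the raw token (`if token in ignore:`), so matching appears to be case-sensitive. A minimal usage sketch, assuming this branch is installed; the first call shows the library's existing behavior, and the expected outputs with `ignore` are taken from the test cases added below:

from number_parser import parse

# Without an ignore list the whole phrase is merged into one number.
parse("twenty one", language="en")                                    # -> "21"

# With "one" ignored, accumulation stops before the ignored token.
parse("twenty one", language="en", ignore=["one"])                    # -> "20 one"

# Ignored tokens also keep larger groupings from being combined.
parse("Two thousand sentences", language="en", ignore=["thousand"])   # -> "2 thousand sentences"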
29 changes: 29 additions & 0 deletions tests/test_number_parsing.py
@@ -121,6 +121,35 @@ def test_parse_sentences_ordinal(expected, test_input, lang):
assert parse(test_input, lang) == expected


@pytest.mark.parametrize(
"test_input,expected,lang,ignore",
[
('fifty fifth sixty seventh', "fifty 5 67", 'en', ['fifty','seven']),
('hundredth and one', "100 and 1", 'en',[]),
('one hundred and forty second', "140 second", 'en', ['second']),
('five thousandth and one', "5000 and one", 'en', ['one']),
# en
('Two thousand sentences', "2 thousand sentences", 'en', ['thousand']),
('twenty one', "20 one", 'en', ['one']),
('I have three apples and one pear.', "I have three apples and 1 pear.", 'en', ['three']),
# numeric
('eleven', "eleven", 'en', ['eleven']),
('ninety thirteen forty', "90 13 forty", 'en', ['forty']),
('one hundred and forty two', "one 140 two", 'en', ['one','two']),
('one hundred and one', "one 100 one", 'en', ['one']),
('seven thousand and nothing else',"seven 1000 and nothing else", 'en', ['seven']),
('five hundred sixty seven thousand twenty four', "five 167020 four", 'en', ['fifty','five','four']),
('one million four hundred twenty-three thousand nine hundred twenty-two', "1000400 twenty-3900 twenty-two", 'en', ['two','twenty']),
('nine hundred ninety-nine thousand nine hundred ninety-nine', "nine 190 nine 1000 nine 190 nine", 'en', ['nine']),
('one million fifty thousand', "1000000 fifty 1000", 'en', ['fifty']),
('two billion one hundred forty seven million four hundred eighty three thousand six hundred forty seven',
"two 1000000000 one 140 seven 1000483 thousand 640 seven", 'en', ['two','thousand','seven','one']),

]
)
def test_parse_including_ignore(expected, test_input, lang, ignore):
assert parse(test_input, lang, ignore) == expected

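One detail visible in the rows above: an empty ignore list (the 'hundredth and one' case) is falsy, so the `if ignore:` branch is skipped and the call behaves exactly like parsing without the new argument. A small illustration, with expected values taken from the parametrized cases above:

from number_parser import parse

# An empty list disables the feature entirely.
parse("hundredth and one", "en", [])                          # -> "100 and 1"

# A non-empty list flushes the running number at every ignored token.
parse("one hundred and forty two", "en", ["one", "two"])      # -> "one 140 two"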

@pytest.mark.parametrize(
"test_input,expected,lang",