Skip to content

Commit

Permalink
Prevent ReDoS in Spanish sentence splitting regex (#1084)
Browse files: browse the repository at this point in the history
  • Loading branch information
Sjord committed Jan 11, 2023
1 parent 1db73ee commit 769e4c0
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
6 changes: 2 additions & 4 deletions dateparser/languages/locale.py
Expand Up @@ -263,7 +263,7 @@ def _sentence_split(self, string, settings):

splitters_dict = {1: r'[\.!?;…\r\n]+(?:\s|$)*', # most European, Tagalog, Hebrew, Georgian,
# Indonesian, Vietnamese
2: r'(?:[¡¿]+|[\.!?;…\r\n]+(?:\s|$))+', # Spanish
2: r'[\.!?;…\r\n]+(\s*[¡¿]*|$)|[¡¿]+', # Spanish
3: r'[|!?;\r\n]+(?:\s|$)+', # Hindi and Bangla
4: r'[。…‥\.!??!;\r\n]+(?:\s|$)+', # Japanese and Chinese
5: r'[\r\n]+', # Thai
Expand All @@ -275,9 +275,7 @@ def _sentence_split(self, string, settings):
split_reg = abbreviation_string + splitters_dict[self.info['sentence_splitter_group']]
sentences = re.split(split_reg, string)

for i in sentences:
if not i:
sentences.remove(i)
sentences = filter(None, sentences)
return sentences

def _simplify_split_align(self, original, settings):
Expand Down
5 changes: 5 additions & 0 deletions tests/test_search.py
Expand Up @@ -416,6 +416,10 @@ def test_search_date_string(self, shortname, datetime_string):
('de 1941', datetime.datetime(1941, 1, 1, 0, 0))],
settings={'RELATIVE_BASE': datetime.datetime(2000, 1, 1)}),
param('es', '¡¡Ay!! En Madrid, a 17 de marzo de 1615. ¿Vos bueno?',
[('a 17 de marzo de 1615', datetime.datetime(1615, 3, 17, 0, 0))],
settings={'RELATIVE_BASE': datetime.datetime(2000, 1, 1)}),
# Swedish
param('sv', 'Efter kommunisternas seger 1922 drog de allierade och Japan bort sina trupper.',
[('1922', datetime.datetime(1922, 1, 1, 0, 0))],
Expand Down Expand Up @@ -657,6 +661,7 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non
# Spanish
param('es', '11 junio 2010'),
param('es', '¡¡Ay!! En Madrid, a 17 de marzo de 1615. ¿Vos bueno?'),
# Swedish
param('sv', ' den 15 augusti 1945 då Kejsardömet'),
Expand Down

0 comments on commit 769e4c0

Please sign in to comment.