Skip to content

Commit

Permalink
Fix a bug in strip_accents_unicode; #15087
Browse files Browse the repository at this point in the history
strip_accents_unicode contained a check to see if applying NFKD
normalization to the input string changed it. If the string was
unchanged, then it would not attempt to remove accents. This meant
that if an input string was already in NFKD form and also contained
accents, the accents were not removed.

Now, strip_accents_unicode always filters out combining characters
after applying NFKD normalization.
  • Loading branch information
Daniel Grady committed Sep 26, 2019
1 parent e424ab1 commit 2f874a8
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 4 deletions.
15 changes: 15 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,21 @@ def test_strip_accents():
expected = 'this is a test'
assert strip_accents_unicode(a) == expected

# strings that are already decomposed
a = "o\u0308" # o with diaeresis
expected = "o"
assert strip_accents_unicode(a) == expected

# combining marks by themselves
a = "\u0300\u0301\u0302\u0303"
expected = ""
assert strip_accents_unicode(a) == expected

# Multiple combining marks on one character
a = "o\u0308\u0304"
expected = "o"
assert strip_accents_unicode(a) == expected


def test_to_ascii():
# check some classical latin accented symbols
Expand Down
5 changes: 1 addition & 4 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,7 @@ def strip_accents_unicode(s):
ASCII equivalent.
"""
normalized = unicodedata.normalize('NFKD', s)
if normalized == s:
return s
else:
return ''.join([c for c in normalized if not unicodedata.combining(c)])
return ''.join([c for c in normalized if not unicodedata.combining(c)])


def strip_accents_ascii(s):
Expand Down

0 comments on commit 2f874a8

Please sign in to comment.