Fix a bug in strip_accents_unicode; #15087

strip_accents_unicode contained a check to see whether applying NFKD normalization to the input string changed it. If the string was unchanged, the function returned it as-is without attempting to remove accents. This meant that if an input string was already in NFKD form and also contained accents (i.e. combining characters), the accents were not removed; for example, the already-decomposed string "o\u0308" (an "o" followed by a combining diaeresis) was returned unchanged instead of "o". Now, strip_accents_unicode always filters out combining characters after applying NFKD normalization.
scikit-learn · Sep 26, 2019 · 2f874a8 · 2f874a8
1 parent e424ab1
commit 2f874a8
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 4 deletions.
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -97,6 +97,21 @@ def test_strip_accents():
     expected = 'this is a test'
     assert strip_accents_unicode(a) == expected
 
+    # strings that are already decomposed
+    a = "o\u0308"  # o with diaresis
+    expected = "o"
+    assert strip_accents_unicode(a) == expected
+
+    # combining marks by themselves
+    a = "\u0300\u0301\u0302\u0303"
+    expected = ""
+    assert strip_accents_unicode(a) == expected
+
+    # Multiple combining marks on one character
+    a = "o\u0308\u0304"
+    expected = "o"
+    assert strip_accents_unicode(a) == expected
+
 
 def test_to_ascii():
     # check some classical latin accentuated symbols

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -130,10 +130,7 @@ def strip_accents_unicode(s):
         ASCII equivalent.
     """
     normalized = unicodedata.normalize('NFKD', s)
-    if normalized == s:
-        return s
-    else:
-        return ''.join([c for c in normalized if not unicodedata.combining(c)])
+    return ''.join([c for c in normalized if not unicodedata.combining(c)])
 
 
 def strip_accents_ascii(s):