Fix a bug in strip_accents_unicode; #15087
strip_accents_unicode contained a check to see if applying NFKD
normalization to the input string changed it. If the string was
unchanged, then it would not attempt to remove accents. This meant
that if an input string was already in NFKD form and also contained
accents, the accents were not removed.

Now, strip_accents_unicode always filters out combining characters
after applying NFKD normalization.
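
To make the failure mode concrete, here is a minimal sketch using only the standard library's unicodedata module (the variable names are illustrative, not part of the commit):

    import unicodedata

    s = "o\u0308"  # "o" followed by U+0308 COMBINING DIAERESIS: already in NFKD form

    # NFKD normalization leaves an already-decomposed string unchanged...
    assert unicodedata.normalize('NFKD', s) == s

    # ...so the old early-return path handed it back with the accent intact.
    # The fix always filters combining characters after normalizing:
    stripped = ''.join(c for c in unicodedata.normalize('NFKD', s)
                       if not unicodedata.combining(c))
    assert stripped == "o"
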
Daniel Grady committed Sep 30, 2019
1 parent e424ab1 commit bb9e237
Showing 3 changed files with 25 additions and 3 deletions.
doc/whats_new/v0.22.rst (4 changes: 4 additions & 0 deletions)
@@ -255,6 +255,10 @@ Changelog
removed in v0.24. :pr:`14520` by
:user:`Guillem G. Subies <guillemgsubies>`.

- |Fix| :func:`feature_extraction.text.strip_accents_unicode` now correctly
removes accents from strings that are in NFKD normalized form. :pr:`15100` by
:user:`Daniel Grady <DGrady>`.

:mod:`sklearn.feature_selection`
................................

sklearn/feature_extraction/tests/test_text.py (15 changes: 15 additions & 0 deletions)
@@ -97,6 +97,21 @@ def test_strip_accents():
    expected = 'this is a test'
    assert strip_accents_unicode(a) == expected

    # strings that are already decomposed
    a = "o\u0308"  # o with diaeresis
    expected = "o"
    assert strip_accents_unicode(a) == expected

    # combining marks by themselves
    a = "\u0300\u0301\u0302\u0303"
    expected = ""
    assert strip_accents_unicode(a) == expected

    # Multiple combining marks on one character
    a = "o\u0308\u0304"
    expected = "o"
    assert strip_accents_unicode(a) == expected


def test_to_ascii():
    # check some classical latin accentuated symbols
sklearn/feature_extraction/text.py (9 changes: 6 additions & 3 deletions)
@@ -129,10 +129,13 @@ def strip_accents_unicode(s):
         Remove accentuated char for any unicode symbol that has a direct
         ASCII equivalent.
     """
-    normalized = unicodedata.normalize('NFKD', s)
-    if normalized == s:
+    try:
+        # If `s` is ASCII-compatible, then it does not contain any accented
+        # characters and we can avoid an expensive list comprehension
+        s.encode("ASCII", errors="strict")
         return s
-    else:
+    except UnicodeEncodeError:
+        normalized = unicodedata.normalize('NFKD', s)
         return ''.join([c for c in normalized if not unicodedata.combining(c)])
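
As a side note on the new control flow above: the try/except gives ASCII-only input a fast path. A small sketch of why that check is sufficient, assuming standard CPython encoding behavior:

    # ASCII-only text cannot carry accents, so encoding succeeds and
    # strip_accents_unicode can return the input without normalizing it.
    "this is a test".encode("ASCII", errors="strict")  # no exception

    # Any non-ASCII character, precomposed or already decomposed, raises,
    # which routes the string through NFKD + combining-mark removal.
    try:
        "o\u0308".encode("ASCII", errors="strict")
    except UnicodeEncodeError:
        pass  # handled by the normalization branch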


