From 4f55747035ba5cc1e6c52def168c3923a0928c14 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 8 Sep 2017 14:33:47 +0200 Subject: [PATCH] More robust hash collision tests in the FeatureHasher --- .../tests/test_feature_hasher.py | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index d258625897e27..c1afb44eb1ba9 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -112,23 +112,19 @@ def test_hasher_zeros(): @ignore_warnings(category=DeprecationWarning) def test_hasher_alternate_sign(): - # the last two tokens produce a hash collision that sums as 0 - X = [["foo", "bar", "baz", "investigation need", "records"]] + X = [["a", "b", "c", "d", "e", "f", "g", "h"]] Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type='string').fit_transform(X) - assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - # check that we have a collision that produces a 0 count - assert_true(len(Xt.data) < len(X[0])) - assert_true((Xt.data == 0.).any()) + assert Xt.data.min() < 0 and Xt.data.max() > 0 Xt = FeatureHasher(alternate_sign=True, non_negative=True, input_type='string').fit_transform(X) - assert_true((Xt.data >= 0).all()) # all counts are positive - assert_true((Xt.data == 0.).any()) # we still have a collision + assert Xt.data.min() > 0 + Xt = FeatureHasher(alternate_sign=False, non_negative=True, input_type='string').fit_transform(X) - assert_true((Xt.data > 0).all()) # strictly positive counts + assert Xt.data.min() > 0 Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, input_type='string').fit_transform(X) # With initially positive features, the non_negative option should @@ -136,6 +132,25 @@ def test_hasher_alternate_sign(): assert_array_equal(Xt.data, Xt_2.data) +@ignore_warnings(category=DeprecationWarning) +def test_hash_collisions(): + X = [["a", "b", "c", "d", "e", "f", "g", "h"]] + + Xt = FeatureHasher(alternate_sign=True, non_negative=False, + n_features=1, input_type='string').fit_transform(X) + # check that some of the hashed tokens are added + # with an opposite sign and cancel out + assert abs(Xt.data[0]) < len(X[0]) + + Xt = FeatureHasher(alternate_sign=True, non_negative=True, + n_features=1, input_type='string').fit_transform(X) + assert abs(Xt.data[0]) < len(X[0]) + + Xt = FeatureHasher(alternate_sign=False, non_negative=True, + n_features=1, input_type='string').fit_transform(X) + assert Xt.data[0] == len(X[0]) + + @ignore_warnings(category=DeprecationWarning) def test_hasher_negative(): X = [{"foo": 2, "bar": -4, "baz": -1}.items()]