Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+1] Fix iforest average path length #13251

Merged
merged 13 commits into from Feb 26, 2019
Copy path View file
@@ -439,6 +439,8 @@ def _average_path_length(n_samples_leaf):
"""
if isinstance(n_samples_leaf, INTEGER_TYPES):
if n_samples_leaf <= 1:
return 0.
This conversation was marked as resolved by glemaitre

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor

I would find a bit less confusing this written with a if/elif statement:

if n_samples_leaf <= 1:
    return 0.
elif n_samples_leaf == 2:
    return 1.
else:
    ...

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor

It will be equivalent to what is written in line 457

if n_samples_leaf <= 2:
return 1.
else:
return 2. * (np.log(n_samples_leaf - 1.) + np.euler_gamma) - 2. * (
@@ -450,10 +452,12 @@ def _average_path_length(n_samples_leaf):
n_samples_leaf = n_samples_leaf.reshape((1, -1))
average_path_length = np.zeros(n_samples_leaf.shape)

mask = (n_samples_leaf <= 1)
not_mask = np.logical_not(mask)
mask_1 = (n_samples_leaf <= 1)
This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
mask_1 = (n_samples_leaf <= 1)
mask_1 = n_samples_leaf <= 1
mask_2 = (n_samples_leaf == 2)
This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
mask_2 = (n_samples_leaf == 2)
mask_2 = n_samples_leaf == 2
not_mask = np.logical_not(np.logical_or(mask_1, mask_2))
This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
not_mask = np.logical_not(np.logical_or(mask_1, mask_2))
not_mask = ~np.logical_or(mask_1, mask_2)

average_path_length[mask] = 1.
average_path_length[mask_1] = 0.
average_path_length[mask_2] = 1.
average_path_length[not_mask] = 2. * (
np.log(n_samples_leaf[not_mask] - 1.) + np.euler_gamma) - 2. * (
n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]
@@ -262,14 +262,17 @@ def test_iforest_subsampled_features():
def test_iforest_average_path_length():
# It tests non-regression for #8549 which used the wrong formula
# for average path length, strictly for the integer case
# Updated to check average path length when input is <= 2 (issue #11839)

result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.
assert_almost_equal(_average_path_length(1), 1., decimal=10)
assert _average_path_length(0) == 0.
This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
assert _average_path_length(0) == 0.
assert _average_path_length(0) == pytest.approx(0)
assert _average_path_length(1) == 0.
This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
assert _average_path_length(1) == 0.
assert _average_path_length(1) == pytest.approx(0)
assert _average_path_length(2) == 1.

This comment has been minimized.

Copy link
@ngoix

ngoix Feb 25, 2019

Contributor

can we also test that _average_path_length is increasing for more values? I guess _average_path_length(2) < _average_path_length(3) would be enough

This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
assert _average_path_length(2) == 1.
assert _average_path_length(2) == pytest.approx(1)
assert_almost_equal(_average_path_length(5), result_one, decimal=10)
assert_almost_equal(_average_path_length(999), result_two, decimal=10)
assert_array_almost_equal(_average_path_length(np.array([1, 5, 999])),
[1., result_one, result_two], decimal=10)
assert_array_almost_equal(_average_path_length(np.array([1, 2, 5, 999])),

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor

Since we are changing this line, could we use assert_allclose

This comment has been minimized.

Copy link
@albertcthomas

albertcthomas Feb 26, 2019

Author Contributor

Yes. What's the difference? assert_allclose does not check the shapes are the same?

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor

assert_allclose uses rtol/atol instead of decimal. It is just recommended by numpy for consistency:
https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_array_almost_equal.html

[0., 1., result_one, result_two], decimal=10)


@pytest.mark.filterwarnings('ignore:default contamination')
@@ -21,6 +21,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import check_outlier_corruption

from sklearn.datasets import load_iris

@@ -252,3 +253,20 @@ def test_contamination_future_warning():
'default contamination parameter 0.1 will change '
'in version 0.22 to "auto"',
neighbors.LocalOutlierFactor().fit, X)


def test_predicted_outlier_number():
# the number of predicted outliers should be equal to the number of
# expected outliers unless there are ties in the abnormality scores.
X = iris.data
n_samples = X.shape[0]
expected_outliers = 30
contamination = float(expected_outliers)/n_samples

clf = neighbors.LocalOutlierFactor(contamination=contamination)
y_pred = clf.fit_predict(X)

num_outliers = np.sum(y_pred != 1)
if num_outliers != expected_outliers:
y_dec = clf.negative_outlier_factor_
check_outlier_corruption(num_outliers, expected_outliers, y_dec)
@@ -18,7 +18,6 @@
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_not_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
@@ -1525,8 +1524,29 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))


def check_outlier_corruption(num_outliers, expected_outliers, decision):
# Check for deviation from the precise given contamination level that may
# be due to ties in the anomaly scores.
if num_outliers < expected_outliers:
start = num_outliers
end = expected_outliers + 1
else:
start = expected_outliers
end = num_outliers + 1

# ensure that all values in the 'critical area' are tied,
# leading to the observed discrepancy between provided
# and actual contamination levels.
sorted_decision = np.sort(decision)
msg = ('The number of predicted outliers is not equal to the expected '
'number of outliers and this difference is not explained by the '
'number of ties in the decision_function values')
This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@albertcthomas

albertcthomas Feb 26, 2019

Author Contributor

is -> in

assert len(np.unique(sorted_decision[start:end])) == 1, msg


def check_outliers_train(name, estimator_orig, readonly_memmap=True):
X, _ = make_blobs(n_samples=300, random_state=0)
n_samples = 300
X, _ = make_blobs(n_samples=n_samples, random_state=0)
X = shuffle(X, random_state=7)

if readonly_memmap:
@@ -1548,16 +1568,16 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):

decision = estimator.decision_function(X)

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
for func in ['decision_function', 'score_samples']:
    output = getattr(estimator, func)(X)
    assert output.dtype == np.dtype('float')
    assert output.shape == (n_samples,)

This comment has been minimized.

Copy link
@albertcthomas

albertcthomas Feb 26, 2019

Author Contributor

I now use a for loop but a bit different to your suggestion as we need the outputs for other checks.

assert decision.dtype == np.dtype('float')
assert decision.shape == (n_samples,)

score = estimator.score_samples(X)
assert score.dtype == np.dtype('float')
scores = estimator.score_samples(X)
assert scores.dtype == np.dtype('float')
assert scores.shape == (n_samples,)

# raises error on malformed input for predict
assert_raises(ValueError, estimator.predict, X.T)

# decision_function agrees with predict
decision = estimator.decision_function(X)
assert decision.shape == (n_samples,)
dec_pred = (decision >= 0).astype(np.int)
dec_pred[dec_pred == 0] = -1
assert_array_equal(dec_pred, y_pred)
@@ -1566,9 +1586,7 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
assert_raises(ValueError, estimator.decision_function, X.T)

# decision_function is a translation of score_samples
y_scores = estimator.score_samples(X)
assert y_scores.shape == (n_samples,)
y_dec = y_scores - estimator.offset_
y_dec = scores - estimator.offset_
assert_allclose(y_dec, decision)

# raises error on malformed input for score_samples
@@ -1581,11 +1599,21 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
# set to 'auto'. This is true for the training set and cannot thus be
# checked as follows for estimators with a novelty parameter such as
# LocalOutlierFactor (tested in check_outliers_fit_predict)
contamination = 0.1
expected_outliers = 30
contamination = float(expected_outliers)/n_samples
This conversation was marked as resolved by albertcthomas

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
contamination = float(expected_outliers)/n_samples
contamination = expected_outliers / n_samples
estimator.set_params(contamination=contamination)
estimator.fit(X)
y_pred = estimator.predict(X)
assert_almost_equal(np.mean(y_pred != 1), contamination)

num_outliers = np.sum(y_pred != 1)
# num_outliers should be equal to expected_outliers unless
# there are ties in the decision_function values. this can
# only be tested for estimators with a decision_function
# method, i.e. all estimators except LOF which is already
# excluded from this if branch.
if num_outliers != expected_outliers:
decision = estimator.decision_function(X)
check_outlier_corruption(num_outliers, expected_outliers, decision)

# raises error when contamination is a scalar and not in [0,1]
for contamination in [-0.5, 2.3]:
@@ -2356,7 +2384,8 @@ def check_decision_proba_consistency(name, estimator_orig):
def check_outliers_fit_predict(name, estimator_orig):
# Check fit_predict for outlier detectors.

X, _ = make_blobs(n_samples=300, random_state=0)
n_samples = 300
X, _ = make_blobs(n_samples=n_samples, random_state=0)
X = shuffle(X, random_state=7)
n_samples, n_features = X.shape
estimator = clone(estimator_orig)
@@ -2378,10 +2407,20 @@ def check_outliers_fit_predict(name, estimator_orig):
if hasattr(estimator, "contamination"):
# proportion of outliers equal to contamination parameter when not
# set to 'auto'
contamination = 0.1
expected_outliers = 30
contamination = float(expected_outliers)/n_samples
estimator.set_params(contamination=contamination)
y_pred = estimator.fit_predict(X)
assert_almost_equal(np.mean(y_pred != 1), contamination)

num_outliers = np.sum(y_pred != 1)
# num_outliers should be equal to expected_outliers unless
# there are ties in the decision_function values. this can
# only be tested for estimators with a decision_function
# method
if (num_outliers != expected_outliers and
hasattr(estimator, 'decision_function')):
decision = estimator.decision_function(X)
check_outlier_corruption(num_outliers, expected_outliers, decision)

# raises error when contamination is a scalar and not in [0,1]
for contamination in [-0.5, 2.3]:
@@ -9,14 +9,16 @@
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import deprecated
from sklearn.utils import _joblib
from sklearn.utils.testing import (assert_raises_regex, assert_equal,
ignore_warnings, assert_warns)
from sklearn.utils.testing import (assert_raises_regex,
assert_equal, ignore_warnings,
assert_warns, assert_raises)
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import set_random_state
from sklearn.utils.estimator_checks import set_checking_parameters
from sklearn.utils.estimator_checks import check_estimators_unfitted
from sklearn.utils.estimator_checks import check_fit_score_takes_y
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
from sklearn.utils.estimator_checks import check_outlier_corruption
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.mixture import GaussianMixture
@@ -360,6 +362,15 @@ def test_check_estimator():
check_estimator(MultiTaskElasticNet())


def test_check_outlier_corruption():
# should raise AssertionError
decision = np.array([0., 1., 1.5, 2.])
assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
# should pass
decision = np.array([0., 1., 1., 2.])
check_outlier_corruption(1, 2, decision)


def test_check_estimator_transformer_no_mixin():
# check that TransformerMixin is not required for transformer tests to run
assert_raises_regex(AttributeError, '.*fit_transform.*',
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.