In [2]:
from __future__ import print_function

from sklearn.datasets import drop_values
import numpy as np

X = [[0, 1, 2],
     [3, 4, 5],
     [6, 7, 8],
     [9, 0, 1],
     [2, 3, 4],
     [8, 9, 8],
     [1, 0, 5],
     [7, 8, 9],
     [5, 4, 3],
     [2, 1, 1],
     [3, 4, 5],
     [2, 3, 4],
     [8, 9, 8],
     [1, 0, 5],
     [7, 8, 9],]
y = [0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2]

# Drop 10% of values across all features for samples
# where the target class label is randomly chosen

X, y, mm, labels = drop_values(X, y,
                               drop_fraction=0.1,
                               return_missing_mask=True,
                               return_labels=True,
                               copy=False,
                               verbose=True,
                               random_state=42)

print("After dropping 10%% of values when class label(s) are %r\n" % labels)
print("y \t X")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', X[i])
    
    
print("\n\n")
# Drop 10% of values across all features for samples
# where the target class label is same as what was chosen before

# NOTE We can now pass the missing mask from the previous step
# to avoid it getting recomputed.

X, y, mm = drop_values(X, y, missing_mask=mm,
                       drop_fraction=0.2,
                       labels=labels,
                       return_labels=False,
                       return_missing_mask=True,
                       copy=False,
                       verbose=True,
                       random_state=42)

print("After dropping another 10%% of values when class label(s) are %r\n" % labels)
print("y \t X")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', X[i])

print('\n\n')
    
# Now drop another 10%, but this time from class 0
# This time let us not modify X inplace and instead return the missing mask and 
# manually set the missing_values

# This time we are not passing the previous missing_mask and allowing it to get computed
# on the fly

# Let us store the old missing mask
mm_old = mm.copy()

X, y, mm = drop_values(X, y,
                       drop_fraction=0.3,
                       # Explicitly specify we want missing values correlated to class 0
                       labels=[1, ],
                       return_labels=False,
                       return_missing_mask=True,
                       missing_mask_only=True,
                       copy=False,
                       verbose=True,
                       random_state=42)

print("NOTE that the missing_values are set. Only the missing mask is updated...")
print("y \t missing_mask")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', mm[i])
    
print('\n\n')
print('\nThe X is not modified')
print("y \t X")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', X[i])
    
print('\n\n')

    
# Manually update the missing values from the mask
# only for the newly missing values

mm_new = mm_old ^ mm
X[mm_new] = np.nan

print("After manually updating the new missing values")
print("y \t X")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', X[i])
    
print('\n\n')

# Now let us add additional 10% of random missing values

X, y = drop_values(X, y,
                   drop_fraction=0.4,
                   # Explicitly specify we want missing values correlated to class 0
                   label_correlation=0,
                   copy=False,
                   verbose=True,
                   random_state=42)

print("y \t X")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', X[i])

There are 4 correlated and 0 non-correlated new missing values introduced.
After dropping 10% of values when class label(s) are array([0])

y 	 X
------------------------
0 	 [ nan   1.   2.]
1 	 [ 3.  4.  5.]
1 	 [ 6.  7.  8.]
1 	 [ 9.  0.  1.]
1 	 [ 2.  3.  4.]
0 	 [ 8.  9.  8.]
0 	 [ 1.  0.  5.]
0 	 [ nan   8.  nan]
0 	 [  5.  nan   3.]
1 	 [ 2.  1.  1.]
2 	 [ 3.  4.  5.]
2 	 [ 2.  3.  4.]
2 	 [ 8.  9.  8.]
2 	 [ 1.  0.  5.]
2 	 [ 7.  8.  9.]



There are 5 correlated and 0 non-correlated new missing values introduced.
After dropping another 10% of values when class label(s) are array([0])

y 	 X
------------------------
0 	 [ nan  nan   2.]
1 	 [ 3.  4.  5.]
1 	 [ 6.  7.  8.]
1 	 [ 9.  0.  1.]
1 	 [ 2.  3.  4.]
0 	 [ nan   9.   8.]
0 	 [ nan   0.   5.]
0 	 [ nan   8.  nan]
0 	 [ nan  nan  nan]
1 	 [ 2.  1.  1.]
2 	 [ 3.  4.  5.]
2 	 [ 2.  3.  4.]
2 	 [ 8.  9.  8.]
2 	 [ 1.  0.  5.]
2 	 [ 7.  8.  9.]



There are 4 correlated and 0 non-correlated new missing values introduced.
NOTE 

In [3]:
from sklearn.utils.multiclass import type_of_target

In [6]:
type_of_target([[0.2, 0.2], [0.3, 0.5], [0.5, 0.2]])

'continuous-multioutput'

In [None]:
from sklearn.datasets import make_classification
from sklearn.datasets import 

In [2]:
>>> from sklearn.datasets import drop_values
>>> X = [[0, 1, 2],
...      [3, 4, 5],
...      [6, 7, 8],
...      [9, 0, 1],
...      [2, 3, 4],
...      [8, 9, 8],
...      [1, 0, 5],
...      [7, 8, 9],
...      [5, 4, 3],
...      [2, 1, 1],
...      [1, 2, 3]]
>>> y = [0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1]
>>> X, y, labels = drop_values(X, y, drop_fraction=0.1,
...                            n_labels=1, return_labels=True,
...                            copy=False, random_state=42)
>>> labels
array([1])
>>> X

NameError: name 'array' is not defined

In [3]:
import numpy as np

import scipy as sp

In [6]:
from sklearn.metrics import mean_squared_error

np.var([11, 12, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11])

0.22749999999999995

TypeError: allclose() takes at least 2 arguments (1 given)

In [79]:
from sklearn.datasets import drop_values
from sklearn.datasets import make_classification, make_regression
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_less
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal

def test_value_dropper_mcar_clf():
    X, y = make_classification(n_samples=100, n_classes=4, n_features=5,
                               n_informative=5, n_redundant=0, n_repeated=0,
                               random_state=0)

    # Inplace dropping of values; 0 correlation case.
    _, _ = drop_values(X, y,
                       drop_fraction=0.3, label_correlation=0,
                       missing_values=np.nan, random_state=0)

    # Check the drop fraction
    assert_almost_equal(np.isnan(X).ravel().sum() / 500., 0.3)

    # Check that there is no correlation (The missing values are spread equally
    # between different label values)
    n_dropped_per_label = [(np.isnan(X)[y==lbl]).ravel().sum() for lbl in np.unique(y)]
    n_samples_per_label = np.bincount(y) * 5
    # That is all the per label missing fraction should be close to 0.3
    per_label_drop_fraction = (n_dropped_per_label / n_samples_per_label.astype(np.float))
    assert_almost_equal(per_label_drop_fraction.mean(), 0.3)
    assert_less(np.std(per_label_drop_fraction), 0.05)

    # Let us drop 0.3 more fraction of values. This time not inplace
    X_old = X.copy()
    X_more_dropped, _ = drop_values(
        X, y, drop_fraction=0.6, label_correlation=0,
        missing_values="NaN",
        copy=True, random_state=0)

    assert_almost_equal(X, X_old)
    try:
        assert_almost_equal(X_more_dropped, X)
    except AssertionError as e:
        assert_true("nan location mismatch" in str(e))

    # Check that there is no correlation (The missing values are spread equally
    # between different label values)
    n_dropped_per_label = [(np.isnan(X_more_dropped)[y==lbl]).ravel().sum() for lbl in np.unique(y)]
    n_samples_per_label = np.bincount(y) * 5
    # That is all the per label missing fraction should now be close to 0.6
    per_label_drop_fraction = (n_dropped_per_label / n_samples_per_label.astype(np.float))
    assert_almost_equal(per_label_drop_fraction.mean(), 0.6)
    assert_less(np.std(per_label_drop_fraction), 0.05)
    

def test_value_dropper_mcar_reg():
    X, y = make_regression(n_samples=100, n_classes=4, n_features=5,
                           n_informative=5, n_redundant=0, n_repeated=0,
                           random_state=0)

    # Inplace dropping of values; 0 correlation case.
    _, _ = drop_values(X, y,
                       drop_fraction=0.3, label_correlation=0,
                       missing_values=np.nan, random_state=0)

    # Check the drop fraction
    assert_almost_equal(np.isnan(X).ravel().sum() / 500., 0.3)

    # Check that there is no correlation (The missing values are spread equally
    # between different target values)
    
    # That is all the per label missing fraction should be close to 0.3
    per_label_drop_fraction = (n_dropped_per_label / n_samples_per_label.astype(np.float))
    assert_almost_equal(per_label_drop_fraction.mean(), 0.3)
    assert_less(np.std(per_label_drop_fraction), 0.05)

    # Let us drop 0.3 more fraction of values. This time not inplace
    X_old = X.copy()
    X_more_dropped, _ = drop_values(
        X, y, drop_fraction=0.6, label_correlation=0,
        missing_values="NaN",
        copy=True, random_state=0)

    assert_almost_equal(X, X_old)
    try:
        assert_almost_equal(X_more_dropped, X)
    except AssertionError as e:
        assert_true("nan location mismatch" in str(e))

    # Check that there is no correlation (The missing values are spread equally
    # between different label values)
    n_dropped_per_label = [(np.isnan(X_more_dropped)[y==lbl]).ravel().sum() for lbl in np.unique(y)]
    n_samples_per_label = np.bincount(y) * 5
    # That is all the per label missing fraction should now be close to 0.6
    per_label_drop_fraction = (n_dropped_per_label / n_samples_per_label.astype(np.float))
    assert_almost_equal(per_label_drop_fraction.mean(), 0.6)
    assert_less(np.std(per_label_drop_fraction), 0.05)