BF+TST : refactor outliers script with testing

practical-neuroimaging · Apr 23, 2013 · 3a80000 · 3a80000
1 parent d120525
commit 3a80000
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 6 deletions.
diff --git a/outliers.py b/outliers.py
@@ -43,8 +43,10 @@ def compute_mu_var(y):
     df = N - 1
     # Mean for each row
     mu = y.mean(axis=1)
+    # The mean removed the second axis. Restore it (length 1) so we can subtract
+    subtracting_mu = np.reshape(mu, (P, 1))
     # Remove mean
-    yc = y - mu
+    yc = y - subtracting_mu
     # Variance(s) and covariances
     var = yc.dot(yc.T) / df
     return mu, var
@@ -67,19 +69,30 @@ def compute_mahal(y, mu, var):
     y : (N,) or (P, N) ndarray
         One row per measure, one column per observation. If a vector, treat as a
         (1, N) array
-    mu : (P,) ndarray
-        Mean of each measure across columns
-    var : (P, P) ndarray
-        Variances (diagonal) and covariances of measures
+    mu : (P,) array-like
+        Mean of each measure across columns.  Can be scalar, array or sequence
+        (list, tuple)
+    var : (P, P) array-like
+        Variances (diagonal) and covariances of measures. Can be scalar, array
+        or sequence (list, tuple)
 
     Returns
     -------
     mahals : (N,) ndarray
         Mahalanobis distances of each observation from `mu`, given `var`
     """
+    # Make sure y is a row vector, if it was only a 1D vector
     y = np.atleast_2d(y)
+    # Shapes
+    P, N = y.shape
+    # Make sure mu and var are arrays
+    mu = np.asarray(mu)
+    # Variance should also be 2D (even if shape (1, 1)) - for np.linalg.inv
+    var = np.atleast_2d(var)
+    # The mean should be shape (P,).  It needs to be (P, 1) shape to subtract
+    subtracting_mu = np.reshape(mu, (P, 1))
     # Mean correct
-    yc = y - mu
+    yc = y - subtracting_mu
     # Correct for (co)variances. For single row, this is the same as dividing by
     # the variance
     y_white = np.linalg.inv(var).dot(yc)

diff --git a/test_outliers.py b/test_outliers.py
@@ -0,0 +1,61 @@
+""" Testing outlier detection
+"""
+
+import numpy as np
+
+import outliers
+
+# Only needed if working interactively; note `reload` is a Python 2 builtin
+reload(outliers)
+
+from nose.tools import assert_equal, assert_true
+from numpy.testing import assert_almost_equal
+
+
+def test_compute_mu_var():
+    # Check the mean and (co)variance estimates from compute_mu_var, using
+    # two hand-checkable rows, one random sample passed as a 1D array, as a
+    # (1, N) array and as a list, and finally a two-row random matrix.
+    # Constant row -> variance 0; [-1, 0, 1] -> variance 2 / (3 - 1) = 1
+    assert_equal(outliers.compute_mu_var([[1, 1, 1, 1]]), (1, 0))
+    assert_equal(outliers.compute_mu_var([[-1, 0, 1]]), (0, 1))
+    # Fixed seed, so the random draws are the same on every run
+    rng = np.random.RandomState(42)
+    sample = rng.normal(3, 7, size=(100,))
+    # The estimates must not depend on how the same data is passed in
+    same_data = (sample, sample.reshape((1, 100)), sample.tolist())
+    for data in same_data:
+        est_mu, est_var = outliers.compute_mu_var(data)
+        assert_almost_equal(est_mu, sample.mean())
+        # ddof=1 gives the N - 1 denominator for the variance
+        assert_almost_equal(est_var, sample.var(ddof=1))
+    # Two measures: mean per row, and a full covariance matrix
+    matrix = rng.normal(3, 7, size=(2, 100))
+    est_mu, est_var = outliers.compute_mu_var(matrix)
+    assert_almost_equal(est_mu, matrix.mean(axis=1))
+    # Rebuild the covariance by hand with N - 1 degrees of freedom
+    centered = matrix - est_mu[:, np.newaxis]
+    n_df = matrix.shape[1] - 1
+    expected_cov = np.dot(centered, centered.T) / n_df
+    assert_almost_equal(est_var, expected_cov)
+
+
+def test_mahal():
+    # Check compute_mahal against hand-worked values and squared z scores
+    by_hand = [4., 1., 0.]
+    # Plain Python inputs: nested list (one row), scalar mu and var
+    assert_almost_equal(outliers.compute_mahal([[-1, 0, 1]], 1, 1), by_hand)
+    # numpy inputs: 1D data vector, 0D arrays for mu and var
+    mu, var = np.array(1), np.array(1)
+    from_arrays = outliers.compute_mahal(np.array([-1, 0, 1]), mu, var)
+    assert_almost_equal(from_arrays, by_hand)
+    # Seeded normal draws with loc 3 and scale 7
+    sample = np.random.RandomState(42).normal(3, 7, size=(100,))
+    # NOTE(review): 7 is the scale (standard deviation) of the draws but is
+    # passed as `var`; confirm this matches compute_mahal's scaling
+    z_scores = (sample - 3) / 7.
+    assert_almost_equal(outliers.compute_mahal(sample, 3, 7), z_scores ** 2)
+
+
+def test_estimate_mu_var():
+    pass  # TODO: placeholder - no checks written yet, so this always passes