Skip to content

Commit

Permalink
BF+TST : refactor outliers script with testing
Browse files Browse the repository at this point in the history
  • Loading branch information
matthew-brett committed Apr 23, 2013
1 parent d120525 commit 3a80000
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 6 deletions.
25 changes: 19 additions & 6 deletions outliers.py
Expand Up @@ -43,8 +43,10 @@ def compute_mu_var(y):
df = N - 1
# Mean for each row
mu = y.mean(axis=1)
# The mean removed the second axis. Restore it (length 1) so we can subtract
subtracting_mu = np.reshape(mu, (P, 1))
# Remove mean
yc = y - mu
yc = y - subtracting_mu
# Variance(s) and covariances
var = yc.dot(yc.T) / df
return mu, var
Expand All @@ -67,19 +69,30 @@ def compute_mahal(y, mu, var):
y : (N,) or (P, N) ndarray
One row per measure, one column per observation. If a vector, treat as a
(1, N) array
mu : (P,) ndarray
Mean of each measure across columns
var : (P, P) ndarray
Variances (diagonal) and covariances of measures
mu : (P,) array-like
Mean of each measure across columns. Can be scalar, array or sequence
(list, tuple)
var : (P, P) array-like
Variances (diagonal) and covariances of measures. Can be scalar, array
or sequence (list, tuple)
Returns
-------
mahals : (N,) ndarray
Mahalanobis distances of each observation from `mu`, given `var`
"""
# Make sure y is a row vector, if it was only a 1D vector
y = np.atleast_2d(y)
# Shapes
P, N = y.shape
# Make sure mu and var are arrays
mu = np.asarray(mu)
# Variance should also be 2D (even if shape (1, 1)) - for np.linalg.inv
var = np.atleast_2d(var)
# The mean should be shape (P,). It needs to be (P, 1) shape to subtract
subtracting_mu = np.reshape(mu, (P, 1))
# Mean correct
yc = y - mu
yc = y - subtracting_mu
# Correct for (co)variances. For single row, this is the same as dividing by
# the variance
y_white = np.linalg.inv(var).dot(yc)
Expand Down
61 changes: 61 additions & 0 deletions test_outliers.py
@@ -0,0 +1,61 @@
""" Testing outlier detection
"""

import numpy as np

import outliers

# Only needed if working interactively: re-executes the already-imported
# `outliers` module so that edits to it are picked up without restarting.
# NOTE(review): bare `reload` is a builtin only on Python 2 - confirm the
# target interpreter before running this file on Python 3.
reload(outliers)

from nose.tools import assert_equal, assert_true
from numpy.testing import assert_almost_equal


def test_compute_mu_var():
    """Check the mean and (co)variance returned by ``compute_mu_var``.

    Covers two hand-checkable single-row inputs, then a seeded random
    sample passed as a 1D array, a (1, 100) row array and a plain list, and
    finally a (2, 100) array compared against an explicit estimate.
    """
    # Hand-checkable rows: a constant row, then a symmetric row
    assert_equal(outliers.compute_mu_var([[1, 1, 1, 1]]), (1, 0))
    assert_equal(outliers.compute_mu_var([[-1, 0, 1]]), (0, 1))

    # Seeded generator keeps the random draws reproducible
    rng = np.random.RandomState(42)
    vector = rng.normal(3, 7, size=(100,))

    def check_against_vector(candidate):
        # `candidate` is `vector` in some container / shape; the estimates
        # must agree with numpy's mean and the 1-df-corrected variance.
        est_mu, est_variance = outliers.compute_mu_var(candidate)
        assert_almost_equal(est_mu, vector.mean())
        assert_almost_equal(est_variance, vector.var(ddof=1))

    # Plain 1D array
    check_against_vector(vector)
    # Same values as a 2D (row) vector
    check_against_vector(vector.reshape((1, 100)))
    # Same values as a list
    check_against_vector(vector.tolist())

    # Two measures, 100 observations each
    arr2d = rng.normal(3, 7, size=(2, 100))
    mu, var = outliers.compute_mu_var(arr2d)
    assert_almost_equal(mu, arr2d.mean(axis=1))
    # Explicit covariance estimate: remove the row means, then divide the
    # cross-products by N - 1 = 99
    centered = arr2d - mu[:, np.newaxis]
    expected_var = centered.dot(centered.T) / 99
    assert_almost_equal(var, expected_var)


def test_mahal():
    """Check ``compute_mahal`` on small fixed inputs and a random vector."""
    # Both small cases use observations -1, 0, 1 with mu = 1 and var = 1
    small_expected = [4., 1., 0.]
    # Lists as input, observations given as a 2D row-vector list
    small_from_lists = outliers.compute_mahal([[-1, 0, 1]], 1, 1)
    assert_almost_equal(small_from_lists, small_expected)
    # Arrays as input, observations given as a 1D vector
    small_from_arrays = outliers.compute_mahal(
        np.array([-1, 0, 1]), np.array(1), np.array(1))
    assert_almost_equal(small_from_arrays, small_expected)

    # Seeded random sample, compared with the squared standardized values
    # formed from the same mu (3) and var (7) arguments
    rng = np.random.RandomState(42)
    vector = rng.normal(3, 7, size=(100,))
    distances = outliers.compute_mahal(vector, 3, 7)
    standardized = (vector - 3) / 7.
    assert_almost_equal(distances, standardized ** 2)


def test_estimate_mu_var():
    """Placeholder test: no checks are implemented here yet."""

0 comments on commit 3a80000

Please sign in to comment.