Add first version of nDCG evaluation

rjagerman · Jun 28, 2017 · 668150c · 668150c
1 parent 524ade8
commit 668150c
Show file tree

Hide file tree

Showing 2 changed files with 167 additions and 0 deletions.
diff --git a/shoelace/evaluation.py b/shoelace/evaluation.py
@@ -0,0 +1,62 @@
+from chainer import cuda, function
+
+
+class NDCG(function.Function):
+    def __init__(self, k=0):
+        self.k = k
+
+    def forward(self, inputs):
+        xp = cuda.get_array_module(*inputs)
+        y, t = inputs
+
+        # Assert arrays have the same shape
+        if t.shape != y.shape:
+            raise ValueError("Input arrays have different shapes")
+
+        # Computing nDCG on empty array should just return 0.0
+        if t.shape[0] == 0:
+            return xp.asarray(0.0),
+
+        # Compute predicted indices by arg sorting
+        predicted_indices = xp.argsort(y)
+        best_indices = xp.argsort(t)
+
+        # Predicted and theoretically best relevance labels
+        predicted_relevance = xp.flip(t[predicted_indices], axis=0)
+        best_relevance = xp.flip(t[best_indices], axis=0)
+
+        # Compute needed statistics
+        length = predicted_relevance.shape[0]
+        arange = xp.arange(length)
+        last = min(self.k, length)
+        if last < 1:
+            last = length
+
+        # Compute regular DCG
+        dcg_numerator = 2 ** predicted_relevance[:last] - 1
+        dcg_denominator = xp.log2(arange[:last] + 2)
+        dcg = xp.sum(dcg_numerator / dcg_denominator)
+
+        # Compute iDCG for normalization
+        idcg_numerator = (2 ** best_relevance[:last] - 1)
+        idcg_denominator = (xp.log2(arange[:last] + 2))
+        idcg = xp.sum(idcg_numerator / idcg_denominator)
+
+        if idcg == 0.0:
+            return xp.asarray(1.0),
+
+        return xp.asarray(dcg / idcg),
+
+
+def ndcg(y, t, k=0):
+    """
+    Computes the nDCG@k for given list of true relevance labels (y_true) and
+    given list of predicted relevance labels (y_score)
+
+    :param y_true: The ground truth relevance labels 
+    :param y_score: The predicted relevance scores
+    :param k: The cut-off point (if set to smaller or equal to 0, it does not
+              cut-off)
+    :return: The nDCG@k value
+    """
+    return NDCG(k=k)(y, t)
diff --git a/test/test_evaluation.py b/test/test_evaluation.py
@@ -0,0 +1,105 @@
+import numpy as np
+from nose.tools import raises, assert_equal
+
+from shoelace.evaluation import ndcg
+
+
+def test_ndcg():
+
+    # Set up data
+    prediction = np.array([0.1, 0.9, 0.2, 3.0, 0.15])
+    ground_truth = np.array([3.0, 3.0, 2.0, 1.0, 1.0])
+
+    # Compute and assert nDCG value
+    assert_equal(ndcg(prediction, ground_truth).data, 0.73213389587665278)
+
+
+def test_ndcg_2():
+
+    # Set up data
+    prediction = np.array([0.1, 0.9, 0.2, 0.15, 3.0])
+    ground_truth = np.array([3.0, 3.0, 2.0, 1.0, 1.0])
+
+    # Compute and assert nDCG value
+    assert_equal(ndcg(prediction, ground_truth).data, 0.73213389587665278)
+
+
+def test_ndcg_3():
+
+    # Set up data
+    prediction = np.array([0.1, 0.9, 0.2, 0.15, 3.0])
+    ground_truth = np.array([3.0, 3.0, 2.0, 1.0, 2.0])
+
+    # Compute and assert nDCG value
+    assert_equal(ndcg(prediction, ground_truth).data, 0.8259562683091511)
+
+
+def test_ndcg_perfect():
+
+    # Set up data
+    prediction = np.array([4.0, 3.0, 2.0, 1.0, 0.0])
+    ground_truth = np.array([3.0, 3.0, 2.0, 1.0, 1.0])
+
+    # Compute and assert nDCG value
+    assert_equal(ndcg(prediction, ground_truth).data, 1.0)
+
+
+def test_ndcg_minimal():
+
+    # Set up data
+    prediction = np.arange(10).astype(dtype=np.float32)
+    ground_truth = np.flip(prediction, axis=0)
+
+    # Compute and assert nDCG value
+    assert_equal(ndcg(prediction, ground_truth).data, 0.39253964576233569)
+
+
+def test_ndcg_at_k():
+
+    # Set up data
+    prediction = np.array([0.3, 0.3, 0.2, 2.14, 0.23])
+    ground_truth = np.array([3.0, 3.0, 2.0, 1.0, 1.0])
+
+    # Compute and assert nDCG@3 value
+    assert_equal(ndcg(prediction, ground_truth, k=3).data, 0.69031878315427031)
+
+
+def test_empty_ndcg():
+
+    # Set up data
+    prediction = np.array([])
+    ground_truth = np.array([])
+
+    # Assert nDCG of empty lists
+    assert_equal(ndcg(prediction, ground_truth).data, 0.0)
+
+
+def test_ndcg_no_preferences():
+
+    # Set up data
+    prediction = np.array([0.3, 0.3, 0.2, 2.14, 0.23])
+    ground_truth = np.array([0.0, 0.0, 0.0, 0.0, 0.0])
+
+    # Compute and assert nDCG value
+    assert_equal(ndcg(prediction, ground_truth).data, 1.0)
+
+
+def test_ndcg_negative_predictions():
+
+    # Set up data
+    prediction = np.array([-0.1, -0.3, 1.9, -0.9, -0.2])
+    ground_truth = np.array([0.0, 1.0, 1.0, 0.0, 0.0])
+
+    # Compute and assert nDCG value
+    assert_equal(ndcg(prediction, ground_truth).data, 0.8772153153380493)
+
+
+@raises(ValueError)
+def test_unequal_ndcg():
+
+    # Set up data
+    prediction = np.array([0.3, 0.3, 0.2])
+    ground_truth = np.array([3.0, 3.0, 2.0, 1.0, 1.0, 2.3])
+
+    # This should raise a ValueError because the lists aren't of equal length
+    ndcg(prediction, ground_truth)