Add membership_matrix() function to pyss3.util (#5)

This function converts the list of training/test labels (i.e., y_train/y_test) into a membership matrix. This function is useful when working with multi-label classification problems and it is meant to be used only internally by the evaluation module (the ``Evaluation`` class). However, in case users want to perform model evaluations using custom evaluation metrics, they could use this function as shown in the following example, in which the performance will be measured in terms of Hamming loss:

```
from pyss3 import SS3
from pyss3.util import Dataset, membership_matrix
from sklearn.metrics import hamming_loss

x_train, y_train = Dataset.load_from_files_multilabel(...)
x_test, y_test = Dataset.load_from_files_multilabel(...)

clf = SS3()
clf.train(x_train, y_train)

y_pred = clf.predict(x_test, multilabel=True)

y_test_mem = membership_matrix(clf, y_test)
y_pred_mem = membership_matrix(clf, y_pred)

hamming_loss(y_test_mem, y_pred_mem)
```

Documentation available here: https://pyss3.rtfd.io/en/latest/api/index.html#pyss3.util.membership_matrix
sergioburdisso · May 13, 2020 · 983bc5a · 983bc5a
1 parent 444e9bc
commit 983bc5a
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 1 deletion.
diff --git a/pyss3/util.py b/pyss3/util.py
@@ -2090,3 +2090,56 @@ def list_by_force(v):
         return v
     except TypeError:
         return [v]
+
+
+def membership_matrix(clf, y_data):
+    """
+    Transform a list of (multiple) labels into a "membership matrix".
+
+    The membership matrix is built by converting each list of category labels
+    (i.e., each *y* in ``y_data``) into a vector in which there's a fixed
+    position associated to each learned category, having the value 1 for each
+    label in *y*, and 0 otherwise.
+
+    When working with multi-label classification problems, this representation
+    enables measuring the performance using common evaluation metrics such
+    as Hamming loss, exact match ratio, accuracy, precision, recall, F1, etc.
+
+    For instance, suppose ``y_data = [[], ['labelA'], ['labelB'], ['labelA', 'labelC']]``
+    and that the classifier ``clf`` has been trained on 3 categories whose labels are
+    'labelA', 'labelB', and 'labelC', then, we would have that:
+
+    >>> membership_matrix(clf, [[], ['labelA'], ['labelB'], ['labelA', 'labelC']])
+
+    returns the following membership matrix:
+
+    >>> [[0, 0, 0],  # []
+    >>>  [1, 0, 0],  # ['labelA']
+    >>>  [0, 1, 0],  # ['labelB']
+    >>>  [1, 0, 1]]  # ['labelA', 'labelC']
+
+    :param clf: the trained classifier
+    :type clf: SS3
+    :param y_data: the list of document labels
+    :type y_data: list of list of str
+    :returns: a (sparse) matrix in which each row is the membership vector of
+              each element (labels) in ``y_data``.
+    :rtype: scipy.sparse.lil.lil_matrix
+    :raises: ValueError
+    """
+    if not clf.__categories__:
+        raise ValueError("The `clf` classifier has not been trained yet!")
+
+    from scipy import sparse
+
+    labels2index = dict([(c, i) for i, c in enumerate(clf.get_categories())])
+    y_data_matrix = sparse.lil_matrix((len(y_data), len(labels2index)), dtype="b")
+
+    try:
+        for i, labels in enumerate(y_data):
+            labels = [labels2index[l] for l in labels]
+            y_data_matrix[i, labels] = 1
+    except KeyError as e:
+        raise ValueError("The `y_data` contains an unknown label (%s)" % str(e))
+
+    return y_data_matrix
diff --git a/tests/test_pyss3.py b/tests/test_pyss3.py
@@ -2,7 +2,7 @@
 """Tests for pyss3."""
 from os import path
 from shutil import rmtree
-from pyss3.util import Dataset
+from pyss3.util import Dataset, membership_matrix
 from pyss3 import \
     SS3, STR_NORM_GV_XAI, STR_NORM_GV, STR_GV, \
     STR_XAI, STR_VANILLA, STR_MOST_PROBABLE, \
@@ -265,11 +265,28 @@ def test_multilabel():
     )
 
     clf = SS3()
+
+    with pytest.raises(ValueError):
+        membership_matrix(clf, [])
+
     clf.fit(x_train, y_train)
 
     assert sorted(clf.get_categories()) == ['insult', 'obscene', 'severe_toxic', 'toxic']
     assert clf.classify_multilabel("this is a unknown document!") == []
 
+    y_pred = [[], ['toxic'], ['severe_toxic'], ['obscene'], ['insult'], ['toxic', 'insult']]
+
+    with pytest.raises(ValueError):
+        membership_matrix(clf, y_pred + [["xxx"]])
+
+    y_pred_memmatrix = membership_matrix(clf, y_pred).todense().tolist()
+    assert y_pred_memmatrix == [[0, 0, 0, 0],  # []
+                                [1, 0, 0, 0],  # ['toxic']
+                                [0, 1, 0, 0],  # ['severe_toxic']
+                                [0, 0, 1, 0],  # ['obscene']
+                                [0, 0, 0, 1],  # ['insult']
+                                [1, 0, 0, 1]]  # ['toxic', 'insult']
+
 
 def test_pyss3_ss3(mockers):
     """Test SS3."""