Skip to content

Commit

Permalink
Add membership_matrix() function to pyss3.util (#5)
Browse files Browse the repository at this point in the history
This function converts the list of training/test labels (i.e.,
y_train/y_test) into a membership matrix. This function is useful when
working with multi-label classification problems and it is meant to be
used only internally by the evaluation module (the ``Evaluation``
class). However, in case users want to perform model evaluations using
custom evaluation metrics, they could use this function as shown in the
following example, in which the performance will be measured in terms of
Hamming loss:

```
from pyss3 import SS3
from pyss3.util import Dataset, membership_matrix

from sklearn.metrics import hamming_loss

x_train, y_train = Dataset.load_from_files_multilabel(...)
x_test, y_test = Dataset.load_from_files_multilabel(...)

clf = SS3()
clf.train(x_train, y_train)

y_pred = clf.predict(x_test, multilabel=True)

y_test_mem = membership_matrix(clf, y_test)
y_pred_mem = membership_matrix(clf, y_pred)

hamming_loss(y_test_mem, y_pred_mem)
```

Documentation available here:
https://pyss3.rtfd.io/en/latest/api/index.html#pyss3.util.membership_matrix
  • Loading branch information
sergioburdisso committed May 13, 2020
1 parent 444e9bc commit 983bc5a
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 1 deletion.
53 changes: 53 additions & 0 deletions pyss3/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2090,3 +2090,56 @@ def list_by_force(v):
return v
except TypeError:
return [v]


def membership_matrix(clf, y_data):
    """
    Transform a list of (multiple) labels into a "membership matrix".

    The membership matrix consists of converting each list of category labels
    (i.e., each *y* in ``y_data``) into a vector in which there's a fixed
    position associated to each learned category, having the value 1 for each
    label in *y*, and 0 otherwise.

    When working with multi-label classification problems, this representation
    enables measuring the performance using common evaluation metrics such
    as Hamming loss, exact match ratio, accuracy, precision, recall, F1, etc.

    For instance, suppose ``y_data = [[], ['labelA'], ['labelB'], ['labelA', 'labelC']]``
    and that the classifier ``clf`` has been trained on 3 categories whose labels are
    'labelA', 'labelB', and 'labelC', then, we would have that:

    >>> membership_matrix(clf, [[], ['labelA'], ['labelB'], ['labelA', 'labelC']])

    returns the following membership matrix::

        [[0, 0, 0],  # []
         [1, 0, 0],  # ['labelA']
         [0, 1, 0],  # ['labelB']
         [1, 0, 1]]  # ['labelA', 'labelC']

    The column order follows the order in which ``clf.get_categories()``
    lists the categories.

    :param clf: the trained classifier
    :type clf: SS3
    :param y_data: the list of document labels
    :type y_data: list of list of str
    :returns: a (sparse) matrix in which each row is the membership vector of
              each element (labels) in ``y_data``.
    :rtype: scipy.sparse.lil_matrix
    :raises: ValueError -- if ``clf`` has no categories yet, or if ``y_data``
             contains a label that is not one of the classifier's categories.
    """
    if not clf.__categories__:
        raise ValueError("The `clf` classifier has not been trained yet!")

    # Imported locally so that scipy is only required when this is called.
    from scipy import sparse

    # Map each category label to its (fixed) column in the matrix.
    label2index = {label: index for index, label in enumerate(clf.get_categories())}
    y_data_matrix = sparse.lil_matrix((len(y_data), len(label2index)), dtype="b")

    try:
        for row, labels in enumerate(y_data):
            columns = [label2index[label] for label in labels]
            # A document without labels keeps its all-zero row.
            if columns:
                y_data_matrix[row, columns] = 1
    except KeyError as e:
        raise ValueError("The `y_data` contains an unknown label (%s)" % str(e))

    return y_data_matrix
19 changes: 18 additions & 1 deletion tests/test_pyss3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""Tests for pyss3."""
from os import path
from shutil import rmtree
from pyss3.util import Dataset
from pyss3.util import Dataset, membership_matrix
from pyss3 import \
SS3, STR_NORM_GV_XAI, STR_NORM_GV, STR_GV, \
STR_XAI, STR_VANILLA, STR_MOST_PROBABLE, \
Expand Down Expand Up @@ -265,11 +265,28 @@ def test_multilabel():
)

clf = SS3()

with pytest.raises(ValueError):
membership_matrix(clf, [])

clf.fit(x_train, y_train)

assert sorted(clf.get_categories()) == ['insult', 'obscene', 'severe_toxic', 'toxic']
assert clf.classify_multilabel("this is a unknown document!") == []

y_pred = [[], ['toxic'], ['severe_toxic'], ['obscene'], ['insult'], ['toxic', 'insult']]

with pytest.raises(ValueError):
membership_matrix(clf, y_pred + [["xxx"]])

y_pred_memmatrix = membership_matrix(clf, y_pred).todense().tolist()
assert y_pred_memmatrix == [[0, 0, 0, 0], # []
[1, 0, 0, 0], # ['toxic']
[0, 1, 0, 0], # ['severe_toxic']
[0, 0, 1, 0], # ['obscene']
[0, 0, 0, 1], # ['insult']
[1, 0, 0, 1]] # ['toxic', 'insult']


def test_pyss3_ss3(mockers):
"""Test SS3."""
Expand Down

0 comments on commit 983bc5a

Please sign in to comment.