In [56]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload


# Given a model output directory :
# 1. Output its overall performance train/dev/test
# 2. Output its per slot performance train/dev/test
# 3. Per slot comparison with other method e.g. baseline model
# 4. 
def evaluate_conll(evaluable):
    """Evaluate sequence tagging hypothesis with CoNLL criteria.

    Example
    -------
    >>> reference = [
    ...     ['B-PER', 'I-PER', 'O'],
    ...     ['B-LOC', 'I-LOC', 'I-LOC', 'O'],
    ...     ['B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC'],
    ...     ['O', 'B-PER']
    ... ]
    >>> hypothesis = [
    ...     ['B-PER', 'I-PER', 'O'],
    ...     ['B-PER', 'I-PER', 'I-PER', 'O'],
    ...     ['O', 'B-PER', 'O', 'B-PER', 'O'],
    ...     ['O', 'O']
    ... ]
    >>> evaluable = [zip(*sent_pair) for sent_pair in zip(reference, hypothesis)]
    >>> overall, by_type = evaluate_conll(evaluable)
    >>> '{0.precision:.2f} {0.recall:.2f}'.format(overall)
    '0.25 0.20'
    >>> '{0.precision:.2f} {0.recall:.2f}'.format(by_type['PER'])
    '0.25 0.33'

    Arguments
    ---------
    evaluable : `iterable`_
        An iterable of sentences where each sentence is an iterable of tag pairs. A tag pair
        ``(ref_tag, hyp_tag)`` at position ``i`` means that the ``i``-th word in that sentence
        has a true tag of ``ref_tag`` and a predicted tag of ``hyp_tag``.

    Returns
    -------
    overall : :obj:`~collections.namedtuple`
         The overall evaluation result. Precision, recall, and F1 score can be accessed via
         ``precision``, ``recall``, and ``f1_score`` attributes respectively.
    by_type : :obj:`dict`
         The evaluation result per tag type. The keys are the tag types (e.g. ``'PER'``)
         and the values are namedtuples with the same structure as ``overall``.

    Notes
    -----
    This evaluation function supports both IOB and IOBES tagging.

    .. _iterable:
        https://docs.python.org/3/glossary.html#term-iterable
    """
    return metrics(evaluate(chain(*evaluable)))


class ConfusionMatrix:
    """A confusion matrix.

    This class can be used to compute the confusion matrix of a multiclass
    classification problem.

    Example
    -------
    >>> reference  = 'a b a a b c'.split()
    >>> hypothesis = 'a a c a b c'.split()
    >>> cm = ConfusionMatrix(zip(reference, hypothesis))
    >>> cm['a', 'a']  # actual class is 'a' and predicted as 'a'
    2
    >>> cm['a', 'b']  # actual class is 'a' but predicted as 'b'
    0
    >>> cm['a', 'c']  # actual class is 'a' but predicted as 'c'
    1
    >>> cm = ConfusionMatrix(zip(reference, hypothesis), marginalize=True)
    >>> '{:.2f}'.format(cm['a', 'a'])
    '0.67'
    >>> '{:.2f}'.format(cm['a', 'b'])
    '0.00'
    >>> '{:.2f}'.format(cm['a', 'c'])
    '0.33'
    >>> print(cm)
         |    a    b    c
    -----+---------------
    a    | 0.67    . 0.33
    b    | 0.50 0.50    .
    c    |    .    . 1.00
    (row = reference, column = hypothesis)

    Arguments
    ---------
    pairs : `iterable`_
        An iterable of ``(ref_class, hyp_class)`` tuple.
    marginalize : bool
        Whether to marginalize the confusion values over the hypothesis.

    Attributes
    ----------
    marginalized : bool
        Whether the confusion matrix is marginalized.
    classes : list
        List of all classes found in ``pairs``.
    """
    def __init__(self, pairs, marginalize=False):
        self.marginalized = marginalize

        pairs = list(pairs)  # ensure pairs can be iterated twice
        classes = set()
        for ref, hyp in pairs:
            classes.update((ref, hyp))
        self.classes = list(classes)

        self._cm = {}
        for c1 in self.classes:
            self._cm[c1] = {}
            for c2 in self.classes:
                # we don't use defaultdict because we want
                # to raise an error if unseen pair is queried
                self._cm[c1][c2] = 0
        for ref, hyp in pairs:
            assert ref in self._cm
            assert hyp in self._cm[ref]
            self._cm[ref][hyp] += 1

        if marginalize:
            for c1 in self._cm:
                total = sum(self._cm[c1].values())
                for c2 in self._cm[c1]:
                    self._cm[c1][c2] /= total

    def __getitem__(self, pair):
        """Get the confusion value of the given tuple of reference and hypothesis class."""
        ref, hyp = pair
        return self._cm[ref][hyp]

    def __str__(self):
        """Return the confusion matrix as a pretty-formatted string."""
        # Convert everything to its string representation
        classes = list(sorted(str(c) for c in self.classes))
        cm = {str(c1): {str(c2): self._val2str(v) for c2, v in self._cm[c1].items()}
              for c1 in self._cm}

        # Compute column width
        max_class_length = max(len(c) for c in classes)
        max_val_length = max(len(val) for c in cm for val in cm[c].values())
        colwidth = max(max_class_length, max_val_length) + 1  # account for margin

        header = '{}|{}'.format(' ' * colwidth, ''.join([c.rjust(colwidth) for c in classes]))
        rule = '{}+{}'.format('-' * colwidth, '-' * colwidth * len(classes))
        out = [header, rule]
        for c1 in classes:
            first_col = c1.ljust(colwidth)
            rem_cols = ''.join([cm[c1][c2].rjust(colwidth) for c2 in classes])
            out.append('{}|{}'.format(first_col, rem_cols))
        out.append('(row = reference, column = hypothesis)')
        return '\n'.join(out)

    def to_array(self):
        """Return the confusion matrix as a NumPy array.

        Returns
        -------
        numpy.ndarray
            The confusion matrix as NumPy array. The row and column order corresponds to
            the order of the classes in :attr:`~ConfusionMatrix.classes`.
        """
        import numpy as np
        
        return np.array([[self._cm[c1][c2] for c2 in self.classes] for c1 in self.classes])

    def _val2str(self, val):
        if self.marginalized:
            return '.' if -1e-6 <= val <= 1e-6 else '{:.2f}'.format(val)
        else:
            return '.' if not val else str(val)


def confusion_matrix_conll(evaluable, marginalize=False):
    """Compute confusion matrix of a CoNLL sequence tagging hypothesis.

    Example
    -------
    >>> reference = [
    ...     ['B-PER', 'I-PER', 'O'],
    ...     ['B-LOC', 'I-LOC', 'I-LOC', 'O'],
    ...     ['B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC'],
    ...     ['O', 'B-PER'],
    ...     ['B-ORG', 'B-ORG']
    ... ]
    >>> hypothesis = [
    ...     ['B-PER', 'I-PER', 'O'],
    ...     ['B-PER', 'I-PER', 'I-PER', 'O'],
    ...     ['O', 'B-PER', 'O', 'B-PER', 'O'],
    ...     ['O', 'O'],
    ...     ['B-ORG', 'B-LOC']
    ... ]
    >>> evaluable = [zip(*sent_pair) for sent_pair in zip(reference, hypothesis)]
    >>> cm = confusion_matrix_conll(evaluable)
    >>> cm['O', 'PER']  # actual tag is O but predicted as PER
    2
    >>> cm['LOC', 'PER']  # actual tag is LOC but predicted as PER
    1

    Arguments
    ---------
    evaluable : `iterable`_
        An iterable of sentences where each sentence is an iterable of tag pairs. A tag pair
        ``(ref_tag, hyp_tag)`` at position ``i`` means that the ``i``-th word in that sentence
        has a true tag of ``ref_tag`` and a predicted tag of ``hyp_tag``.

    Returns
    -------
    :class:`ConfusionMatrix`
        The confusion matrix.

    Notes
    -----
    This function only supports IOB tagging.
    """
    return ConfusionMatrix(cast_as_multiclass(evaluable), marginalize=marginalize)

from collections import namedtuple
from itertools import product


Span = namedtuple('Span', 'start end tagname')


def cast_as_multiclass(evaluable):
    pairs = []
    for sent in evaluable:
        ref_sent, hyp_sent = zip(*sent)
        ref_spans = find_spans(ref_sent)
        hyp_spans = find_spans(hyp_sent)

        matched_ref, matched_hyp = set(), set()
        for i, ref_span in enumerate(ref_spans):
            for j, hyp_span in enumerate(hyp_spans):
                if match_exact(ref_span, hyp_span):
                    # Found a match between reference and hypothesis
                    matched_ref.add(i)
                    matched_hyp.add(j)
                    pairs.append((ref_span.tagname, hyp_span.tagname))
                    break
        # Unmatched spans are paired with O tags
        for i, ref_span in enumerate(ref_spans):
            if i not in matched_ref:
                pairs.append((ref_span.tagname, 'O'))
        for j, hyp_span in enumerate(hyp_spans):
            if j not in matched_hyp:
                pairs.append(('O', hyp_span.tagname))
    return pairs


def find_spans(tags):
    spans = []
    last_tagname = 'O'
    last_start = -1
    for i, tag in enumerate(tags):
        if tag != 'O':
            if tag.startswith('B-'):
                # Found a new span; need to append the span seen previously, if any
                if last_start >= 0:
                    assert last_tagname != 'O'
                    assert i > last_start
                    spans.append(Span(last_start, i - 1, last_tagname))
                last_tagname = tag[2:]
                last_start = i
            elif not tag.startswith('I-'):
                raise ValueError('tag {} is not a valid IOB tag'.format(tag))
            # TODO what if a span starts with I- tag?
        elif last_start >= 0:
            # Found O tag but we saw a span, so we append that
            assert last_tagname != 'O'
            assert i > last_start
            spans.append(Span(last_start, i - 1, last_tagname))
            last_tagname = 'O'
            last_start = -1
    # We reach EOS but we saw a span
    if last_start >= 0:
        assert last_tagname != 'O'
        assert len(tags) > last_start
        spans.append(Span(last_start, len(tags) - 1, last_tagname))
    return spans


def match_exact(span_a, span_b):
    return span_a.start == span_b.start and span_a.end == span_b.end


def plot_confusion_matrix(cm, classes, save_to, marginalized=True):
    # Import numpy and matplotlib here so users don't have to install
    # them if they don't need to plot
    import numpy as np
    import matplotlib
    matplotlib.use('AGG')  # use AGG backend because we only write to file
    import matplotlib.pyplot as plt

    cmap = plt.cm.Blues
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    fmt = '{:.2f}' if marginalized else '{:d}'
    for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, "{:,}".format(cm[i, j]),horizontalalignment='center', color='white' if cm[i, j] > thresh else 'black')

    #plt.tight_layout()
    plt.ylabel('Reference')
    plt.xlabel('Hypothesis')
    plt.savefig(save_to)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
from util.CoNLL import readCoNLL, readOntoNotes

# slots_stl = performance_per_slot("/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/results/SingleTask_ATIS_Full/predictions/ATIS_25_dev.conll", out_file_name="SingleTask_ATIS_Full_ATIS_25_dev.tsv")
# slots_stl = performance_per_slot("/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/results/SingleTask_MIT_Restaurant_Full/predictions/MIT_Restaurant_13_dev.conll", out_file_name="SingleTask_MIT_Restaurant_Full_MIT_Restaurant_13_dev.tsv")
# slots_stl = performance_per_slot("/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/results/SingleTask_MIT_Movie_Full/predictions/MIT_Movie_19_dev.conll", out_file_name="STL_MIT_Movie_19_dev.tsv")

In [62]:
sentences = readCoNLL("/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/results_emnlp/MIT_Restaurant_NER_ONLY_DIFFERENT_LEVEL/predictions/MIT_Restaurant_dev.conll", {0:'tokens', 1:'reference', 2:'hypothesis'})

In [63]:
reference = [ sentence['reference'] for sentence in sentences]
hypothesis = [ sentence['hypothesis'] for sentence in sentences]
    

In [64]:
evaluable = [zip(*sent_pair) for sent_pair in zip(reference, hypothesis)]
cm = confusion_matrix_conll(evaluable)
str(cm).split('\n') 
cm.classes
print(cm.classes)
print(cm.to_array())

['Amenity', 'Cuisine', 'O', 'Hours', 'Restaurant_Name', 'Rating', 'Location', 'Price', 'Dish']
[[367   8 167   0   2   2   2   1   1]
 [  9 438  43   6   2   0   0   0  21]
 [168  44   0  48  46  38 113  13  57]
 [  0   2  43 141   0   1   0   0   0]
 [  0   1  47   0 310   0   1   0   3]
 [  6   1  40   1   1 173   1   0   0]
 [  3   0 115   1   1   0 622   0   0]
 [  3   0  20   0   2   1   0 121   0]
 [  2  19  42   1  10   0   0   0 232]]


In [65]:
plot_confusion_matrix(cm.to_array(), cm.classes, 'x.pdf')

In [66]:
cm.classes

['Amenity',
 'Cuisine',
 'O',
 'Hours',
 'Restaurant_Name',
 'Rating',
 'Location',
 'Price',
 'Dish']

In [67]:
str(cm).split("\n")

['                |         Amenity         Cuisine            Dish           Hours        Location               O           Price          Rating Restaurant_Name',
 '----------------+------------------------------------------------------------------------------------------------------------------------------------------------',
 'Amenity         |             367               8               1               .               2             167               1               2               2',
 'Cuisine         |               9             438              21               6               .              43               .               .               2',
 'Dish            |               2              19             232               1               .              42               .               .              10',
 'Hours           |               .               2               .             141               .              43               .               1               .',
 'Lo