In [21]:
# Single-sample fixture: the gold tags and the prediction are the same
# 16-token sequence, with 'T' at positions 0 and 9 and 'O' everywhere else.
# Each list(...) call builds a fresh list of one-character tag strings.
true_data = [list("TOOOOOOOOTOOOOOO")]
predicted = [list("TOOOOOOOOTOOOOOO")]

In [19]:
# NOTE(review): this binds the project-local `utils.evaluate_ts`. A function
# with the same name is defined later in this notebook and rebinds the name,
# so which implementation runs depends on cell execution order.
# The output recorded for this cell is a 4-tuple of zeros, while the notebook's
# own `evaluate_ts` returns three values -- presumably `utils.evaluate_ts`
# expects a different tag scheme than plain 'T'/'O'; verify against utils.
from utils import evaluate_ts


evaluate_ts(true_data, predicted)

(0.0, 0.0, 0.0, 0.0)

In [23]:
import numpy as np

# Added to every denominator in evaluate_ts so that precision, recall and F1
# are still computable (as 0.0) when a count is zero, instead of raising
# ZeroDivisionError.
SMALL_POSITIVE_CONST = 1e-8


def tag2ts(ts_tag_sequence):
    """
    Transform ts tag sequence to target spans.

    A target span is a maximal run of consecutive 'T' tags. Any other tag
    ('O', or an unexpected label) ends the span that is currently open, so a
    span never stretches across a non-target token.

    :param ts_tag_sequence: tag sequence with 'T' and 'O'
    :return: List of (start, end) tuples for target spans; both indices are
        inclusive and spans are listed in left-to-right order. An empty
        sequence, or one without any 'T', yields an empty list.
    """
    ts_sequence = []
    beg, end = -1, -1  # -1 means "no span is currently open"
    for i, ts_tag in enumerate(ts_tag_sequence):
        if ts_tag == "T":
            if beg == -1:
                beg = i
            end = i
        elif beg != -1:
            # Close on ANY non-'T' tag, not only on a literal 'O'. Otherwise an
            # unexpected label would leave the span open and a later 'T' would
            # extend it over the non-target token.
            ts_sequence.append((beg, end))
            beg, end = -1, -1
    if beg != -1:
        # The sequence ended while a span was still open.
        ts_sequence.append((beg, end))
    return ts_sequence


def match_ts(gold_ts_sequence, pred_ts_sequence):
    """
    Count how many predicted target spans also occur in the gold standard.

    A predicted span counts as a hit when an equal span is found in the gold
    sequence (membership test with ``in``). Every predicted entry is checked
    on its own, so a repeated predicted span is counted each time it appears.

    :param gold_ts_sequence: gold standard target spans
    :param pred_ts_sequence: predicted target spans
    :return: hit_count, gold_count, pred_count
    """
    gold_total = len(gold_ts_sequence)
    pred_total = len(pred_ts_sequence)
    matched = sum(1 for span in pred_ts_sequence if span in gold_ts_sequence)
    return matched, gold_total, pred_total


def evaluate_ts(gold_ts, pred_ts):
    """
    Score predicted 'T'/'O' tag sequences against the gold standard at span level.

    Span counts are summed over all samples first and the metrics are computed
    once from those totals. SMALL_POSITIVE_CONST is added to every denominator,
    so zero counts give 0.0 instead of an error, and a perfect prediction
    scores marginally below 1.0.

    :param gold_ts: gold standard ts tags, one tag sequence per sample
    :param pred_ts: predicted ts tags, one tag sequence per sample
    :return: Precision, Recall, F1 scores
    """
    assert len(gold_ts) == len(pred_ts)

    total_hits = total_gold = total_pred = 0
    for gold_tags, pred_tags in zip(gold_ts, pred_ts):
        gold_spans = tag2ts(ts_tag_sequence=gold_tags)
        pred_spans = tag2ts(ts_tag_sequence=pred_tags)
        hits, golds, preds = match_ts(
            gold_ts_sequence=gold_spans, pred_ts_sequence=pred_spans
        )
        total_hits += hits
        total_gold += golds
        total_pred += preds

    # Adding the float constant makes each denominator a float, so the
    # divisions below are float divisions.
    precision = total_hits / (total_pred + SMALL_POSITIVE_CONST)
    recall = total_hits / (total_gold + SMALL_POSITIVE_CONST)
    f1_score = 2 * precision * recall / (precision + recall + SMALL_POSITIVE_CONST)

    return precision, recall, f1_score


# Example usage: two samples, each tag sequence written as a compact string and
# expanded into a list of one-character tags.
# Sample 1 is predicted exactly. In sample 2 the gold 'T' tags are at
# positions 0 and 5, while the prediction has 'T' at positions 0 and 1.
true_data = [
    list("TOOOOOOOOTOOOOOO"),
    list("TOOOOTO"),
]
predicted = [
    list("TOOOOOOOOTOOOOOO"),
    list("TTOOOOO"),
]

scores = evaluate_ts(true_data, predicted)
precision, recall, f1 = scores
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Precision: 0.6666666644444444, Recall: 0.49999999875, F1 Score: 0.5714285648979592
