In [None]:
# Lint as: python3
"""Example tagging for Toxic Spans based on Spacy.
Requires:
  pip install spacy scikit-learn
Install models:
  python -m spacy download en_core_web_sm
"""

import ast
import csv
import random
import statistics
import sys

import sklearn
import spacy

sys.path.append('../evaluation')
# import semeval2021
# import fix_spans

In [None]:
def _contiguous_ranges(span_list):
    """Extracts continguous runs [1, 2, 3, 5, 6, 7] -> [(1,3), (5,7)]."""
    output = []
    for _, span in itertools.groupby(
        enumerate(span_list), lambda p: p[1] - p[0]):
        span = list(span)
        output.append((span[0][1], span[-1][1]))
    return output

import ast
import csv
import itertools
import string
import sys

# Characters trimmed from the edges of a span by default.
SPECIAL_CHARACTERS = string.whitespace


def fix_spans(spans, text, special_characters=SPECIAL_CHARACTERS):
    """Trims the edges of each contiguous span and discards very short spans.

    Every contiguous run of offsets is shrunk from both sides while its
    boundary character is one of `special_characters`. A run is kept only if,
    after trimming, `end - begin > 1`; runs of one or two characters are
    therefore dropped.

    :param spans: a list of offsets into `text`
    :param text: the string the offsets index into
    :param special_characters: characters to strip from span boundaries
    :return: a flat list of the offsets that survive the clean-up
    """
    kept_offsets = []
    for start, stop in _contiguous_ranges(spans):
        # Advance the left edge past boundary characters.
        while text[start] in special_characters and start < stop:
            start += 1
        # Pull the right edge back past boundary characters.
        while text[stop] in special_characters and start < stop:
            stop -= 1
        if stop - start <= 1:
            continue
        kept_offsets += range(start, stop + 1)
    return kept_offsets

In [None]:
def read_datafile(filename):
  """Reads csv file with python span list and text.

  Each row must provide a `spans` column holding a Python literal (parsed
  with `ast.literal_eval`) and a `text` column. The spans are passed through
  `fix_spans` together with the row's text before being returned.

  :param filename: path of the csv file to read
  :return: a list of (fixed_spans, text) tuples, one per row
  """
  data = []
  # newline='' leaves newline handling to the csv module, as its docs require,
  # so line breaks inside quoted fields are read back unchanged. The spans
  # index into the text, so the text must not be altered on the way in.
  with open(filename, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
      fixed = fix_spans(
          ast.literal_eval(row['spans']), row['text'])
      data.append((fixed, row['text']))
  return data

In [None]:
def f1(predictions, gold):
    """
    Offset-level F1 (equivalently, the DICE coefficient) between two offset lists.
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    Duplicated offsets count once; two empty lists score 1, and exactly one
    empty list scores 0.
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: a score between 0 and 1
    """
    no_gold = len(gold) == 0
    no_predictions = len(predictions) == 0
    if no_gold:
        # Nothing to find: predicting nothing is perfect, anything else is wrong.
        return 1. if no_predictions else 0.
    if no_predictions:
        return 0.
    predicted_offsets, gold_offsets = set(predictions), set(gold)
    overlap = len(predicted_offsets & gold_offsets)
    return (2 * overlap) / (len(predicted_offsets) + len(gold_offsets))

In [None]:
# Input locations: gold annotations and the predictions to be scored.
ACTUAL_CSV = '/content/drive/MyDrive/Submission/devfinal.csv'
PREDICTIONS_CSV = '/content/drive/MyDrive/Submission/pred.csv'

print('loading actual data')
act = read_datafile(ACTUAL_CSV)

print('loading predictions data')
res = read_datafile(PREDICTIONS_CSV)

loading actual data
loading predictions data


In [None]:
print('evaluation')

# Rows are paired purely by position, so both files must contain the same
# number of rows; otherwise the average would be computed over a truncated
# set of pairs.
if len(act) != len(res):
  raise ValueError(
      'actual and prediction files differ in length: %d vs %d rows'
      % (len(act), len(res)))

scores = []
for pred_row, gold_row in zip(res, act):
  # Each row is a (spans, text) tuple; only the spans are scored.
  pred = pred_row[0]
  span = gold_row[0]
  scores.append(f1(pred, span))

print('avg F1 %g' % statistics.mean(scores))

evaluation
avg F1 0.618362
