# evaluate_annotation.py

**-> I ran this code and it works!**

Assumes that the `annotations` folder contains:

*   `eng/annotationsheet_<term>_<annotator>_<suffix>.tsv`
*   `nld/annotationsheet_<term>_<annotator>_<suffix>.tsv`

where `<term>` is each term in `eng_terms` (for `eng`) and each term in `nld_terms` (for `nld`).


In [None]:
# evaluate_annotation.py
# Roderick Li and Jasmijn Cnossen
# Language as Data
# December 2021

# import all packages
import pandas as pd
import glob
import os.path
from itertools import combinations
from sklearn.metrics import cohen_kappa_score, confusion_matrix

eng_terms = ["abortion", "life", "rights", "law"]
nld_terms = ["abortus", "leven", "rechten", "wet"]
terms = [["abortion", "life", "rights", "law"], ["abortus", "leven", "rechten", "wet"]]
categories = ["pro", "anti", "neutral"]
languages = ["eng", "nld"]

# Module-level accumulators: one entry per compared annotator pair, English
# pairs first and Dutch pairs second. pair_labels is kept in step with the
# two score lists so the output table can never get out of sync with them.
prop_agree = []
cohens_kappa = []
pair_labels = []


def read_annotations(language, term, root="annotations"):
    """Load the "Annotation" column of every annotation sheet for one term.

    Sheets are looked up as
    <root>/<language>/annotationsheet_<term>_<annotator>_<affix>.tsv.

    Args:
        language: Sub-folder name, e.g. "eng" or "nld".
        term: Search term the sheets belong to.
        root: Folder that holds the per-language sub-folders.

    Returns:
        A dict mapping annotator name to the sheet's "Annotation" column,
        in alphabetical order of the sheet paths. If one annotator has
        several sheets for the term, the last one read wins.

    Raises:
        KeyError: If a sheet has no "Annotation" column.
    """
    annotations = {}
    sheet_prefix = "annotationsheet_" + term + "_"
    # The underscore after the term keeps e.g. "law" from also picking up
    # sheets of a longer term that merely starts with "law".
    pattern = os.path.join(root, language, sheet_prefix + "*.tsv")

    # glob returns paths in arbitrary order; sorting makes the annotator
    # order (and so the orientation of each confusion matrix) reproducible.
    for sheet in sorted(glob.glob(pattern)):
        # splitext only strips the final extension, so dots elsewhere in the
        # file name (e.g. "_v1.0") do not break the parsing.
        stem, _extension = os.path.splitext(os.path.basename(sheet))
        annotator = stem[len(sheet_prefix):].split("_")[0]

        # keep_default_na=False: read cells as plain text instead of letting
        # pandas turn empty or "NA"-like cells into NaN.
        annotation_data = pd.read_csv(sheet, sep="\t", header=0, keep_default_na=False)
        annotations[annotator] = annotation_data["Annotation"]

    return annotations


def score_pairs(annotations):
    """Compute agreement statistics for every pair of annotators.

    Args:
        annotations: Dict mapping annotator name to that annotator's labels.

    Returns:
        A list with one tuple per annotator pair:
        (annotator_a, annotator_b, proportion, kappa, confusions), where
        proportion is the share of items with identical labels, kappa is
        Cohen's kappa and confusions is the confusion matrix (annotator_a
        on the rows, annotator_b on the columns, ordered as `categories`).
        The list is empty when fewer than two annotators are given.
    """
    scored = []
    for annotator_a, annotator_b in combinations(annotations, 2):
        labels_a = annotations[annotator_a]
        labels_b = annotations[annotator_b]
        agreement = [anno1 == anno2 for anno1, anno2 in zip(labels_a, labels_b)]
        proportion = sum(agreement) / len(agreement)
        kappa = cohen_kappa_score(labels_a, labels_b, labels=categories)
        confusions = confusion_matrix(labels_a, labels_b, labels=categories)
        scored.append((annotator_a, annotator_b, proportion, kappa, confusions))
    return scored


def evaluate_language(language, language_terms, root="annotations"):
    """Score all terms of one language and record the results.

    Appends to the module-level lists prop_agree, cohens_kappa and
    pair_labels (one entry per annotator pair).

    Args:
        language: Sub-folder name, e.g. "eng" or "nld".
        language_terms: Terms to evaluate, in output order.
        root: Folder that holds the per-language sub-folders.

    Returns:
        The list of confusion matrices, one per annotator pair.
    """
    matrices = []
    for term in language_terms:
        scored = score_pairs(read_annotations(language, term, root=root))
        for annotator_a, annotator_b, proportion, kappa, confusions in scored:
            # With a single pair the term alone identifies the row; with more
            # annotators the pair is spelled out so the rows stay distinct.
            if len(scored) == 1:
                label = term
            else:
                label = term + " (" + annotator_a + " vs " + annotator_b + ")"
            pair_labels.append(label)
            prop_agree.append(proportion)
            cohens_kappa.append(kappa)
            matrices.append(confusions)
    return matrices


### ENGLISH ###

matrices_eng = evaluate_language("eng", eng_terms)


### DUTCH ###

matrices_nld = evaluate_language("nld", nld_terms)

### OUTPUT INTERANNOTATOR AGREEMENT ###

# "Proportion agreement": [0.80, 0.76, 0.84, 0.78, 0.75, 0.67, 0.88, 1.00]
# "Cohen's Kappa": [0.41, 0.48, 0.31, 0.27, -0.14, -0.14, 0.60, 1.00]

stats = {"Proportion agreement": prop_agree, "Cohen's Kappa": cohens_kappa}
# The index is built from the pairs that were actually scored, so the table
# still works when a term has more (or fewer) than two annotators.
stats_table = pd.DataFrame(stats, index=pair_labels)
print(stats_table)