In [1]:
from cassis import *
from pathlib import Path
import os
import pandas as pd
import numpy as np
import krippendorff
import zipfile
from itertools import chain
from sklearn.metrics import cohen_kappa_score



In [2]:
def get_nested_key_by_annotator(annotator, key, dictionary):
    return dictionary[annotator][key]

def get_text_span_indices(span):
    return [span.begin, span.end]

def get_text_span_event_type_andre(span):
    return span.Events

def get_text_span_event_type_nuclei(span):
    return span.nuclei

def get_text_span_event_direction(span):
    return span.DirectionofChange

def spans_overlap(span_1, span_2):
    return set(list(range(*get_text_span_indices(span_1)))).intersection(list(range(*get_text_span_indices(span_2))))



Pilot 1 

In [39]:
from collections import Counter
counter = Counter(all_docs)
print(counter)
def count_by_frequency(counter_obj):
    """
    Given a Counter, return the number of items
    that occur exactly 3 times, 2 times, and 1 time.
    """
    counts = counter_obj.values()
    return {
        3: sum(1 for c in counts if c == 3),
        2: sum(1 for c in counts if c == 2),
        1: sum(1 for c in counts if c == 1),
        0: sum(1 for c in counts if c == 0)
    }
count_frequency = count_by_frequency(counter)

Counter({'29.txt': 3, '28.txt': 3, '17.txt': 3, '12.txt': 3, '11.txt': 3, 'narrative_20.txt': 3, 'narrative_19.txt': 3, 'narrative_18.txt': 3, 'narrative_16.txt': 3, '4.txt': 3, 'narrative_17.txt': 3, 'narrative_15.txt': 3, 'narrative_14.txt': 3, 'narrative_10.txt': 3, 'narrative_2.txt': 3, '3.txt': 3, '2.txt': 3, 'narrative_11.txt': 3, 'narrative_13.txt': 3, 'narrative_1.txt': 3, 'narrative_0.txt': 3, 'narrative_12.txt': 3, '26.txt': 3, '32.txt': 3, '33.txt': 3, '27.txt': 3, '31.txt': 3, '19.txt': 3, '18.txt': 3, '24.txt': 3, '30.txt': 3, 'narrative_23.txt': 2, 'narrative_22.txt': 2, 'narrative_21.txt': 2, 'narrative_25.txt': 2, 'narrative_24.txt': 2, 'narrative_26.txt': 2, '9.txt': 2, '8.txt': 2, '7.txt': 2, 'narrative_3.txt': 2, 'min_5.txt': 2, 'min_4.txt': 2, 'min_6.txt': 2, 'min_19.txt': 2, 'min_3.txt': 2, 'min_2.txt': 2, 'min_18.txt': 2, 'min_0.txt': 2, 'max_18.txt': 2, 'max_9.txt': 2, 'max_8.txt': 2, 'max_19.txt': 2, 'min_1.txt': 2, 'min_16.txt': 2, 'max_14.txt': 2, 'max_5.txt':

In [41]:
for c, num in count_frequency.items():
    print(f"{c}: {num/(31+48+20)} of {31+48+20}")


3: 0.31313131313131315 of 99
2: 0.48484848484848486 of 99
1: 0.20202020202020202 of 99
0: 0.0 of 99


In [37]:
names = ["aaron", "ida2024", "samuel"]

aaronlist = Path("outputs/pilot_2/annotation").rglob('aaron.zip')
samuellist = Path("outputs/pilot_2/annotation").rglob('samuel.zip')
idalist = Path("outputs/pilot_2/annotation").rglob('ida2024.zip')

aaron_annotated_docs = [str(path).split("/")[-2] for path in aaronlist]
samuel_annotated_docs = [str(path).split("/")[-2] for path in samuellist]
ida_annotated_docs = [str(path).split("/")[-2] for path in idalist]

overlap_annotated_docs = [doc for doc in aaron_annotated_docs if doc in samuel_annotated_docs and doc in ida_annotated_docs]
all_annotated_docs = list(set(aaron_annotated_docs + samuel_annotated_docs + ida_annotated_docs))

all_docs = aaron_annotated_docs + samuel_annotated_docs + ida_annotated_docs

overlap_pilot_1_docs = [doc for doc in overlap_annotated_docs if doc.startswith("narrative")]
overlap_pilot_2_docs = [doc for doc in overlap_annotated_docs if not doc.startswith("narrative")]
all_pilot_1_docs = [doc for doc in all_annotated_docs if doc.startswith("narrative")]
all_pilot_2_docs = [doc for doc in all_annotated_docs if not doc.startswith("narrative")]


print(f"In the first pilot study, we have {len(overlap_pilot_1_docs)} documents annotated by all annotators, {len(all_pilot_1_docs)} documents annotated by at least one annotator.")
print(f"In the second pilot study, we have {len(overlap_pilot_2_docs)} documents annotated by all annotators, {len(all_pilot_2_docs)} documents annotated by at least one annotator.")

names = ["ida2024", "samuel"]

samuellist = Path("outputs/pilot_3/annotation").rglob('samuel.zip')
idalist = Path("outputs/pilot_3/annotation").rglob('ida2024.zip')

samuel_annotated_docs = [str(path).split("/")[-2] for path in samuellist]
ida_annotated_docs = [str(path).split("/")[-2] for path in idalist]
overlap_pilot_3_docs = [doc for doc in samuel_annotated_docs if doc in ida_annotated_docs]
overlap_pilot_3_docs_min = [doc for doc in overlap_pilot_3_docs if "min" in doc]
overlap_pilot_3_docs_max = [doc for doc in overlap_pilot_3_docs if "max" in doc]
all_docs += samuel_annotated_docs + ida_annotated_docs
print(f"In the third pilot study, we have {len(overlap_pilot_3_docs)} documents annotated by all annotators. Min: {len(overlap_pilot_3_docs_min)}, Max: {len(overlap_pilot_3_docs_max)}")


In the first pilot study, we have 14 documents annotated by all annotators, 39 documents annotated by at least one annotator.
In the second pilot study, we have 17 documents annotated by all annotators, 20 documents annotated by at least one annotator.
In the third pilot study, we have 38 documents annotated by all annotators. Min: 18, Max: 20


In [None]:

annotation_dict = {"doc_id": [], "text": [], "annotator": [], "pilot": [], "doc_label": [], "triples": [], "spans": [], "relations": []}

docs_path = {
             "pilot_2" : [overlap_pilot_1_docs, overlap_pilot_2_docs],
             "pilot_3" : [overlap_pilot_3_docs_min+overlap_pilot_3_docs_max]
             }
df_pilot_all = []
helper = True
names = ["aaron", "ida2024", "samuel"]

    
annotation_dict = {"doc_id": [], "text": [], "annotator": [], "pilot": [], "doc_label": [], "triples": [], "spans": [], "relations": []}
for pilot, documents in docs_path.items():
    if "pilot_3" in pilot:
        names = ["ida2024", "samuel"]
    else:
        names = ["aaron", "ida2024", "samuel"]
    for j, docs in enumerate(documents):
        print(f"Processing {len(docs)} documents for {pilot}...")
        if j > 0:
            helper = False
        for i, doc in enumerate(docs):
            doc_id = doc.split(".")[0]
            with open(f'./outputs/{pilot}/annotation/{doc}/TypeSystem.xml', 'rb') as f:
                typesystem = load_typesystem(f)

            for name in names:
                #print(f"Processing {name} in document {doc} ({i+1}/{len(docs)})...")
                doc_labels = []
                spans = []
                relations = []

                try:
                    with open(f'./outputs/{pilot}/annotation/{doc}/{name}.xmi', 'rb') as f:
                        cas = load_cas_from_xmi(f, typesystem=typesystem)
                except FileNotFoundError:
                    print(f"File not found for {name} in document {doc}. Skipping.")
                    continue
                
                annotation_dict["annotator"].append(name)
                for doc_label in cas.select('custom.DocumentLabel'):
                    annotation_dict["doc_label"].append(doc_label.label)
                annotation_dict["spans"].append(cas.select('custom.Span'))
                annotation_dict["relations"].append(cas.select('custom.Relation'))
                triples = []
                for relation in cas.select('custom.Relation'):
                    if None in [relation.Governor.Events, relation.Dependent.DirectionofChange, relation.Dependent.Events]:
                        continue
                    else:
                        triples.append((relation.Governor.Events, relation.Dependent.DirectionofChange, relation.Dependent.Events))
                triples = set(triples)
                annotation_dict["doc_id"].append(doc_id)
                annotation_dict["triples"].append(triples)
                annotation_dict["text"].append(cas.sofa_string)
                if helper:
                    annotation_dict["pilot"].append("pilot_1")
                else:
                    annotation_dict["pilot"].append(pilot)
                    
        df = pd.DataFrame.from_dict(annotation_dict)
        df = df.replace({None: np.nan})
        df_pilot_all.append(df)
        reliability_data = [df[df["annotator"]==name].sort_values(by=['doc_id', 'text'], ascending=True)["doc_label"].values.tolist() for name in  names]
        print(reliability_data)
        print("K-alpha:", krippendorff.alpha(reliability_data=reliability_data, level_of_measurement="nominal"))
df_pilot_all = pd.concat(df_pilot_all, ignore_index=True)
df_pilot_all.to_csv("../data/annotated/pilot_annotation.csv", index=False)

Processing 14 documents for pilot_2...
[['inflation-related', 'inflation-related', 'inflation-related', 'non-inflation-related', 'non-inflation-related', 'inflation-cause-related', 'non-inflation-related', 'inflation-cause-related', 'non-inflation-related', 'non-inflation-related', 'inflation-related', 'inflation-related', 'non-inflation-related', 'non-inflation-related'], ['inflation-related', 'inflation-related', 'inflation-cause-related', 'inflation-cause-related', 'non-inflation-related', 'inflation-cause-related', 'inflation-cause-related', 'inflation-cause-related', 'non-inflation-related', 'inflation-cause-related', 'inflation-related', 'inflation-related', 'non-inflation-related', 'non-inflation-related'], ['inflation-related', 'inflation-cause-related', 'inflation-cause-related', 'inflation-related', 'non-inflation-related', 'inflation-cause-related', 'inflation-cause-related', 'inflation-cause-related', 'non-inflation-related', 'non-inflation-related', 'inflation-cause-relate

In [27]:
df_pilot_all[df_pilot_all["pilot"]=="pilot_1"].groupby("annotator")["doc_label"].value_counts(normalize=True)

annotator  doc_label              
aaron      non-inflation-related      0.500000
           inflation-related          0.357143
           inflation-cause-related    0.142857
ida2024    inflation-cause-related    0.428571
           inflation-related          0.285714
           non-inflation-related      0.285714
samuel     inflation-cause-related    0.500000
           non-inflation-related      0.285714
           inflation-related          0.214286
Name: proportion, dtype: float64

Proportion of inflation cause dominant
1. pilot 1: 0.357
2. pilot 2: 0.106
3. pilot 3: 0.356

In [6]:
reliability_data = [df[df["pilot"].isin(["pilot_3"])][df["annotator"]==name].sort_values(by='doc_id', ascending=True)["doc_label"].values.tolist() for name in  names]
print(reliability_data)
print("K-alpha:", krippendorff.alpha(reliability_data=reliability_data, level_of_measurement="nominal"))

[['inflation-related', 'inflation-related', 'inflation-cause-dominant', 'non-inflation-related', 'inflation-cause-dominant', 'inflation-related', 'inflation-related', 'inflation-related', 'non-inflation-related', 'inflation-cause-dominant', 'inflation-related', 'non-inflation-related', 'inflation-related', 'inflation-cause-dominant', 'non-inflation-related', 'non-inflation-related', 'inflation-related', nan, 'inflation-related', nan, 'inflation-related', 'inflation-related', 'inflation-cause-dominant', 'non-inflation-related', 'inflation-cause-dominant', 'inflation-related', 'inflation-related', 'inflation-related', 'non-inflation-related', 'inflation-cause-dominant', 'inflation-related', 'non-inflation-related', 'inflation-related', 'inflation-cause-dominant', 'non-inflation-related', 'non-inflation-related', 'inflation-related', 'inflation-related'], ['inflation-cause-dominant', 'inflation-cause-dominant', 'inflation-cause-dominant', 'non-inflation-related', 'inflation-cause-dominant

  reliability_data = [df[df["pilot"].isin(["pilot_3"])][df["annotator"]==name].sort_values(by='doc_id', ascending=True)["doc_label"].values.tolist() for name in  names]
  reliability_data = [df[df["pilot"].isin(["pilot_3"])][df["annotator"]==name].sort_values(by='doc_id', ascending=True)["doc_label"].values.tolist() for name in  names]


In [7]:
df = df_pilot_all.sort_values(by=['text','pilot'], ascending=True)
df

Unnamed: 0,doc_id,text,annotator,pilot,doc_label,triples,spans,relations
260,min_8,&apos;Moderate&apos; Recession In Offing For C...,ida2024,pilot_3,inflation-related,{},[],[]
261,min_8,&apos;Moderate&apos; Recession In Offing For C...,samuel,pilot_3,inflation-cause-dominant,"{(Monetary Policy, Increase, Inflation)}","[c.Span(Events=Monetary Policy, Is_Nuclei=Cons...",[c.Relation(Dependent=c.Span(DirectionofChange...
268,max_8,&apos;Moderate&apos; Recession In Offing For C...,ida2024,pilot_3,inflation-related,{},[],[]
269,max_8,&apos;Moderate&apos; Recession In Offing For C...,samuel,pilot_3,inflation-cause-dominant,"{(Monetary Policy, Decrease, Inflation)}","[c.Span(Events=Monetary Policy, Is_Nuclei=Cons...",[c.Relation(Dependent=c.Span(DirectionofChange...
84,29,3M to Cut Costs as Business Cools Beyond Masks...,aaron,pilot_2,non-inflation-related,{},[],[]
...,...,...,...,...,...,...,...,...
106,2,Will Stocks Be a Trade-War Casualty? -- Barron...,ida2024,pilot_2,inflation-related,{},[],[]
107,2,Will Stocks Be a Trade-War Casualty? -- Barron...,samuel,pilot_2,non-inflation-related,{},[],[]
198,2,Will Stocks Be a Trade-War Casualty? -- Barron...,aaron,pilot_2,non-inflation-related,{},[],[]
199,2,Will Stocks Be a Trade-War Casualty? -- Barron...,ida2024,pilot_2,inflation-related,{},[],[]


In [8]:
df_pilot_all[df_pilot_all["annotator"]=="ida2024"].text.value_counts()

text
USDA National Pork Midday FOB Plant Report - Oct 10 \n\n\n\nLM_PK600 \n \nDes Moines, IA              Wed, Oct 10, 2018                  USDA Market News \nNATIONAL DAILY PORK REPORT FOB PLANT - Negotiated Sales - Morning \n \n-------------------------------------------------------------------------------- \nLoads PORK CUTS          :     173.61 \nLoads TRIM/PROCESS PORK  :     27.32 \n-------------------------------------------------------------------------------- \nUSDA ESTIMATED PORK CUT-OUT VALUES - as of 9:30am \nBased on negotiated prices and volume of pork cuts delivered within 14 days \nand on average industry cutting yields.  Values reflect U.S. dollars per \n100 pounds. \nCalculations for 205 lb Pork Carcass. 53-54% lean, 0.65"-0.80" BF Last Rib \n-------------------------------------------------------------------------------- \n                      Today's Estimated Primal Cutout \n \nDate            Loads  Carcass    Loin    Butt     Pic     Rib     Ham   Belly \n----