## Export:


So far, this is the only export I could get working.

In [None]:
from cassis import load_typesystem, load_cas_from_xmi
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Documentation for cassis
# https://github.com/dkpro/dkpro-cassis


# might be useful:
# https://spacy.io/api/textcategorizer
# https://spacy.io/universe/project/classyclassification
# https://stackoverflow.com/questions/62075223/how-to-improve-a-german-text-classification-model-in-spacy

In [None]:
# Load the type system and the annotated document.
# The type system declares all annotation types that are configured in INCEpTION;
# a single type-system file should be enough for the entire dataset.

# Both files are opened in binary mode and handed to cassis for parsing.
with open("../data/TypeSystem.xml", "rb") as f:
    ts = load_typesystem(f)

# The XMI file is parsed against the type system loaded above.
with open("../data/Gerichtsurteile-pos-AW-neu-optimiert-BB.xmi", "rb") as f:
    cas = load_cas_from_xmi(f, typesystem=ts)

In [None]:
def report_span_cat(span):
    """Collect the category attributes of one span annotation in a dict.

    Args:
        span: Annotation object that exposes each category listed below as
            an attribute of the same name.

    Returns:
        dict: Maps every category name to the value stored on ``span``,
        in the order the categories are listed in ``category_names``.

    Raises:
        AttributeError: If ``span`` lacks one of the listed attributes.
    """
    # KAT5Ausformulierung is left out on purpose: it seems to hold
    # single-use comments only.
    category_names = (
        "Protagonistinnen",
        "Protagonistinnen2",
        "Protagonistinnen3",
        "Forderung",
        "KAT1MoralisierendesSegment",
        "KAT2Subjektive_Ausdrcke",
        "KOMMENTAR",
        "KommunikativeFunktion",
        "Moralwerte",
    )
    return {name: getattr(span, name) for name in category_names}

In [None]:
def sort_spans(cas, ts):
    """Group all ``custom.Span`` annotations by category and category value.

    The result is a two-level mapping ``result[category][value] -> [spans]``:
    the first level is the INCEpTION category (Protagonistinnen, Forderung,
    etc.), the second level is the value a span carries for that category
    (e.g. 'Forderer:in', 'Adresassat:in', 'Benefizient:in').

    As a side effect, one line per category is printed that lists the values
    found for it.

    Args:
        cas: Object whose ``select(type_name)`` yields the span annotations.
        ts: Object whose ``get_type("custom.Span")`` returns the span type;
            only its ``name`` attribute is used.

    Returns:
        defaultdict: Nested ``defaultdict`` of lists as described above.
        Categories without a truthy value on a span are skipped for that span.
    """
    span_type = ts.get_type("custom.Span")
    grouped = defaultdict(lambda: defaultdict(list))

    for annotation in cas.select(span_type.name):
        categories = report_span_cat(annotation)
        for category, value in categories.items():
            if not value:
                # Category is not set on this span.
                continue
            grouped[category][value].append(annotation)

    # Report which values were encountered for each category.
    for category, spans_by_value in grouped.items():
        print(f"{category}: {list(spans_by_value)}")
    return grouped

In [None]:
def find_cat_from_str(cat_entry, span_dict):
    """Return the category under which the value ``cat_entry`` is filed.

    Args:
        cat_entry: Second-level key (category value) to look for.
        span_dict: Two-level mapping ``category -> value -> spans``.

    Returns:
        The first top-level key of ``span_dict`` (in iteration order) whose
        sub-mapping contains ``cat_entry``, or ``None`` when no category
        contains it. If the same value occurs under several categories, only
        the first one is reported.
    """
    matching_categories = (
        category
        for category, spans_by_value in span_dict.items()
        if cat_entry in spans_by_value
    )
    return next(matching_categories, None)

In [None]:
def get_overlap_percent(cat_1, cat_2, span_dict, ret_occ=False):
    """Compute how often spans labelled ``cat_1`` are also labelled ``cat_2``.

    Both arguments are category *values* (second-level keys of ``span_dict``);
    the category each one belongs to is looked up with ``find_cat_from_str``.

    Args:
        cat_1: Value that selects the spans to inspect.
        cat_2: Value whose presence on those spans is counted.
        span_dict: Two-level mapping ``category -> value -> spans``.
        ret_occ: If true, return the raw counts instead of the fraction.

    Returns:
        ``(occurrences, total)`` if ``ret_occ`` is true, otherwise the
        fraction ``occurrences / total`` rounded to 7 decimals (``0.0`` when
        there are no spans for ``cat_1``).

    Raises:
        KeyError: If ``cat_1`` or ``cat_2`` is not a value of any category in
            ``span_dict``.
    """
    o_cat1 = find_cat_from_str(cat_1, span_dict)
    o_cat2 = find_cat_from_str(cat_2, span_dict)

    # Fail early with a clear message. Indexing a nested defaultdict with the
    # None returned for an unknown value would silently insert empty entries
    # into span_dict and end in a ZeroDivisionError or a bare KeyError(None).
    for value, category in ((cat_1, o_cat1), (cat_2, o_cat2)):
        if category is None:
            raise KeyError(
                f"{value!r} is not a value of any category in span_dict"
            )

    spans = span_dict[o_cat1][cat_1]
    total = len(spans)
    occurrences = sum(
        1 for span in spans if report_span_cat(span)[o_cat2] == cat_2
    )

    if ret_occ:
        return occurrences, total
    if total == 0:
        # Only reachable with a hand-built span_dict that holds an empty list.
        return 0.0
    return round(occurrences / total, 7)

In [None]:
def get_percent_matrix(span_dict, cat_list=None):
    """Build a square table of pairwise overlap fractions between values.

    Args:
        span_dict: Two-level mapping ``category -> value -> spans``.
        cat_list: Category values to compare. When ``None``, every value of
            every category in ``span_dict`` is used, in iteration order
            (values occurring under several categories are listed repeatedly).

    Returns:
        pandas.DataFrame: Rows and columns are labelled with ``cat_list``;
        cell ``[row, column]`` holds
        ``get_overlap_percent(row, column, span_dict)``.
    """
    if cat_list is None:
        cat_list = [
            value
            for spans_by_value in span_dict.values()
            for value in spans_by_value.keys()
        ]

    size = len(cat_list)
    percent_matrix = np.zeros((size, size))
    # Fill the matrix one row at a time, columns in cat_list order.
    for row, row_value in enumerate(cat_list):
        for col, col_value in enumerate(cat_list):
            overlap = get_overlap_percent(row_value, col_value, span_dict)
            percent_matrix[row, col] = overlap

    return pd.DataFrame(percent_matrix, index=cat_list, columns=cat_list)

In [None]:
# Group every custom span by category and value; sort_spans also prints the
# values found for each category.
span_dict = sort_spans(cas, ts)

# Look up which category the value "Liberty" is filed under.
find_cat_from_str("Liberty", span_dict)

In [None]:
# Fraction of the spans labelled "Forderer:in" that are also labelled "Neutral".
get_overlap_percent("Forderer:in", "Neutral", span_dict)

In [None]:
# Overlap matrix over every category value found in span_dict, drawn as a heatmap.
# Each cell is the fraction of spans with the row label that also carry the column label.
df = get_percent_matrix(span_dict)
plt.figure(figsize=(16, 16))
ax = sns.heatmap(df, cmap="cividis")

In [None]:
# Same heatmap, restricted to a hand-picked subset of category values.
df_small = get_percent_matrix(
    span_dict,
    [
        "Appell",
        "Adresassat:in",
        "Forderer:in",
        "soziale Gruppe",
        "Benefizient:in",
        "Neutral",
        "Institution",
        "Expression",
    ],
)
plt.figure(figsize=(8, 8))
ax = sns.heatmap(df_small, cmap="cividis")