## Export:


So far, this is the only export I could get working.

In [None]:
from cassis import load_typesystem, load_cas_from_xmi
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os

# Documentation for cassis
# https://github.com/dkpro/dkpro-cassis


# might be useful:
# https://spacy.io/api/textcategorizer
# https://spacy.io/universe/project/classyclassification
# https://stackoverflow.com/questions/62075223/how-to-improve-a-german-text-classification-model-in-spacy

In [None]:
# Load the type system and one annotated text file.
# The type system includes all possible types that are configured in INCEpTION,
# so a single type system file should be enough for the entire dataset.

with open("../data/TypeSystem.xml", "rb") as f:
    ts = load_typesystem(f)

with open("../data/Gerichtsurteile-pos-AW-neu-optimiert-BB.xmi", "rb") as f:
    cas = load_cas_from_xmi(f, typesystem=ts)

In [None]:
def sort_spans(cas, ts):
    """Group all ``custom.Span`` annotations by category and label.

    Args:
        cas: Loaded CAS; ``cas.select(type_name)`` must yield the span
            annotations, each of which is indexable by feature name.
        ts: Type system; ``ts.get_type("custom.Span")`` must return the span
            type whose ``name`` is handed to ``cas.select``.

    Returns:
        A nested ``defaultdict`` where ``result[category][label]`` is the list
        of spans that carry ``label`` in the feature ``category``, e.g.
        ``result["Protagonistinnen"]["Forderer:in"]``. A span appears once for
        every category in which it has a non-empty label. Spans whose
        ``KOMMENTAR`` feature equals ``"Dopplung"`` are left out entirely.
    """
    span_type = ts.get_type("custom.Span")
    span_dict = defaultdict(lambda: defaultdict(list))

    # All features (categories) of a span that are evaluated.
    cat_list = [
        "Protagonistinnen",
        "Protagonistinnen2",
        "Protagonistinnen3",
        "Forderung",
        "KAT1MoralisierendesSegment",
        "KAT2Subjektive_Ausdrcke",
        "KOMMENTAR",
        "KommunikativeFunktion",
        "Moralwerte",
    ]

    for span in cas.select(span_type.name):
        # Spans marked as duplicates ("Dopplung") are unwanted data points.
        # The feature name must match the "KOMMENTAR" entry of cat_list; the
        # previous misspelling meant the filter never looked at the real
        # comment feature.
        if span["KOMMENTAR"] == "Dopplung":
            continue

        for cat in cat_list:
            label = span[cat]
            # Empty / unset features do not form a label of their own.
            if label:
                span_dict[cat][label].append(span)

    return span_dict

In [None]:
def load_data_dir(dir_path):
    """Load all annotated ``*.xmi`` files of a directory.

    The type system is read from the first ``*.xml`` file (in sorted order)
    inside ``dir_path`` and is used for every XMI file.

    Args:
        dir_path: Directory that contains the type system ``*.xml`` file and
            the ``*.xmi`` files.

    Returns:
        A dict that maps each file name (without extension) to
        ``{"data": sort_spans(cas, ts), "file_type": <extension without dot>}``.
        Entries are inserted in sorted file-name order.

    Raises:
        FileNotFoundError: If ``dir_path`` contains no ``*.xml`` file.
    """
    # This file breaks the loader because it has an invalid XMI character.
    # It is matched by base name so the skip works for any ``dir_path``.
    skipped_files = {"Wikipediadiskussionen-neg-BD-neu-optimiert-CK.xmi"}

    # Sorted so the chosen type system and the result order are deterministic.
    ts_files = sorted(glob.glob(os.path.join(dir_path, "*.xml")))
    if not ts_files:
        raise FileNotFoundError(
            f"No type system (*.xml) file found in {dir_path!r}"
        )
    with open(ts_files[0], "rb") as f:
        ts = load_typesystem(f)

    data_dict_list = {}
    for data_file in sorted(glob.glob(os.path.join(dir_path, "*.xmi"))):
        base_name = os.path.basename(data_file)
        if base_name in skipped_files:
            continue

        with open(data_file, "rb") as f:
            cas = load_cas_from_xmi(f, typesystem=ts)

        # splitext keeps working for file names that contain further dots.
        stem, extension = os.path.splitext(base_name)
        data_dict_list[stem] = {
            "data": sort_spans(cas, ts),
            "file_type": extension.lstrip("."),
        }

    return data_dict_list

In [None]:
def find_cat_from_str(cat_entry, span_dict):
    """Return the main category that contains the label ``cat_entry``.

    ``span_dict`` maps main categories to dicts keyed by label. The first main
    category (in iteration order) whose dict has ``cat_entry`` as a key is
    returned; ``None`` is returned when no category contains it.
    """
    matching_categories = (
        main_cat for main_cat, labels in span_dict.items() if cat_entry in labels
    )
    return next(matching_categories, None)

In [None]:
def get_overlap_percent(cat_1, cat_2, data_dict_list, file_name, ret_occ=False):
    """Return the share of ``cat_1`` spans that are also labelled ``cat_2``.

    So far this only works on a span basis, not on a sentence basis: a span
    counts as overlapping when the same span object carries both labels.

    Args:
        cat_1: Label whose spans form the base set.
        cat_2: Label that is looked for on each of those spans.
        data_dict_list: Mapping of file name to ``{"data": span_dict, ...}``.
        file_name: Key of the file inside ``data_dict_list``.
        ret_occ: If true, return the raw counts instead of the ratio.

    Returns:
        ``(occurrences, total)`` if ``ret_occ`` is true, otherwise
        ``occurrences / total`` rounded to 7 decimals. When there are no
        ``cat_1`` spans (``total == 0``) the ratio is reported as ``0.0``.
    """
    span_dict = data_dict_list[file_name]["data"]
    o_cat1 = find_cat_from_str(cat_1, span_dict)
    o_cat2 = find_cat_from_str(cat_2, span_dict)

    # A label that does not occur in this file has no spans. Do not index the
    # span dict with the resulting None: sort_spans builds it as a nested
    # defaultdict, so the lookup would silently insert an empty entry.
    spans = span_dict[o_cat1][cat_1] if o_cat1 is not None else []
    total = len(spans)

    if o_cat2 is None:
        occurrences = 0
    else:
        occurrences = sum(1 for span in spans if span[o_cat2] == cat_2)

    if ret_occ:
        return occurrences, total
    if total == 0:
        # Avoid a ZeroDivisionError; nothing to overlap with.
        return 0.0
    return round(occurrences / total, 7)

In [None]:
def get_percent_matrix(data_dict_list, file_name, cat_list=None):
    """Build a square DataFrame of pairwise overlap shares between labels.

    Cell ``[a, b]`` holds ``get_overlap_percent(a, b, data_dict_list,
    file_name)``. When ``cat_list`` is ``None``, every label of every main
    category found in the file's span dict is used, in iteration order.
    """
    if cat_list is None:
        label_dicts = data_dict_list[file_name]["data"].values()
        cat_list = [label for labels in label_dicts for label in labels]

    size = len(cat_list)
    percent_matrix = np.zeros((size, size))
    for row, row_label in enumerate(cat_list):
        for col, col_label in enumerate(cat_list):
            percent_matrix[row, col] = get_overlap_percent(
                row_label, col_label, data_dict_list, file_name
            )

    return pd.DataFrame(percent_matrix, index=cat_list, columns=cat_list)

In [None]:
def report_instances(data_dict_list, file_names=None):
    """Count the spans per (main category, label) for one or more files.

    Args:
        data_dict_list: Mapping of file name to ``{"data": span_dict, ...}``
            where ``span_dict[main_category][label]`` is a list of spans.
        file_names: A single file name, a list of file names, or ``None`` to
            report on every file in ``data_dict_list``.

    Returns:
        A DataFrame with one column per file and a two-level row index
        (``"Main Category"``, ``"Sub Category"``). Each cell is the number of
        spans; combinations that do not occur in a file are 0. Two extra rows
        under ``"total instances"`` hold the column sum (``"with invalid"``)
        and that sum minus the ``"Keine Moralisierung"`` count
        (``"without invalid"``). Rows are sorted in descending order.
    """
    if file_names is None:
        file_names = list(data_dict_list.keys())
    elif isinstance(file_names, str):
        file_names = [file_names]

    total_with = ("total instances", "with invalid")
    total_without = ("total instances", "without invalid")
    invalid_row = ("KAT1MoralisierendesSegment", "Keine Moralisierung")

    # file name -> (main category, label) -> number of spans.
    # The tuple keys make pandas build the two-level row index directly.
    instance_dict = {}
    for file_name in file_names:
        # Initialise the total rows so they can simply be overwritten later.
        file_counts = {total_with: 0, total_without: 0}
        for main_cat_key, main_cat_value in data_dict_list[file_name]["data"].items():
            for sub_cat_key, sub_cat_value in main_cat_value.items():
                file_counts[(main_cat_key, sub_cat_key)] = len(sub_cat_value)
        instance_dict[file_name] = file_counts

    df = pd.DataFrame(instance_dict)
    df.index = df.index.set_names(["Main Category", "Sub Category"])

    # Fill missing combinations BEFORE computing the totals: otherwise a file
    # without "Keine Moralisierung" spans yields NaN for "without invalid",
    # which a later fillna would turn into a wrong total of 0.
    df = df.fillna(0)

    # Add rows for total instances.
    df.loc[total_with, :] = df.sum(axis=0).values
    # The invalid row is absent when no reported file has such a span.
    invalid_counts = df.loc[invalid_row, :].values if invalid_row in df.index else 0
    df.loc[total_without, :] = df.loc[total_with, :].values - invalid_counts

    # Sort by index and occurrence number.
    df = df.sort_values(
        by=[
            "Main Category",
            "Sub Category",
            file_names[0],
        ],
        ascending=False,
    )
    return df

In [None]:
def report_spans(data_dict_list, file_names=None):
    """Report the spans per (main category, label) for one or more files.

    The table has the same layout as the one from ``report_instances``; each
    count cell is replaced by the ``;``-joined string representations of the
    spans. Cells that are not overwritten (e.g. the "total instances" rows)
    keep the values from ``report_instances``.

    Args:
        data_dict_list: Mapping of file name to ``{"data": span_dict, ...}``.
        file_names: A single file name, a list of file names, or ``None`` to
            report on every file in ``data_dict_list``.

    Returns:
        A DataFrame of dtype ``object``.
    """
    if file_names is None:
        file_names = list(data_dict_list.keys())
    elif isinstance(file_names, str):
        file_names = [file_names]

    # Reusing report_instances makes it easy to include the total number of
    # spans for each column and removes the need to duplicate the pandas setup.
    # The numeric count columns are cast to object so strings can be stored in
    # them. The frame is rebound rather than slice-assigned (df[:] = ...),
    # because a slice assignment writes the values back into the existing
    # columns and does not reliably change their dtype.
    df_spans = report_instances(data_dict_list, file_names).astype("object")

    for file_name in file_names:
        span_dict = data_dict_list[file_name]["data"]

        for main_cat_key, main_cat_value in span_dict.items():
            for sub_cat_key, sub_cat_value in main_cat_value.items():
                # Multiple options for how to report the spans are available.
                #
                # Used here: report the entire span object as a string, e.g.
                # c.Span(Protagonistinnen=Forderer:in, Protagonistinnen2=Individuum, Protagonistinnen3=Own Group, begin=21822, end=21874);
                # c.Span(Protagonistinnen=Benefizient:in, Protagonistinnen2=Institution, Protagonistinnen3=Own Group, begin=21974, end=21984)
                # Maybe the c.Span() wrapper should be removed, but it is not
                # clear what exactly is wanted here.
                #
                # Alternative: report only the begin or end index of each
                # span, i.e. str(span["end"]) instead of str(span).
                span_str = ";".join(str(span) for span in sub_cat_value)
                # Square brackets are stripped from the joined text.
                span_str = span_str.replace("[", "").replace("]", "")

                df_spans.at[
                    (main_cat_key, sub_cat_key),
                    file_name,
                ] = span_str

    return df_spans

In [None]:
# Load every XMI file found in the data directory into one dict keyed by file name.
data_dict_list = load_data_dir("../data")

In [None]:
# I would guess that the correlation table can be derived from here somehow;
# just calling df.corr() yields unsatisfactory results.

# These DataFrames can now easily be saved as CSV files.
df_instances = report_instances(data_dict_list)
df_instances.head(10)

In [None]:
# This DataFrame can now easily be filtered, e.g. by one main category.
df_instances.loc["KAT2Subjektive_Ausdrcke"]

In [None]:
# Same table layout as df_instances, but the cells hold the joined span strings.
df_spans = report_spans(data_dict_list)
df_spans.head(10)

In [None]:
# Share of the "Forderer:in" spans in this file that are also labelled "Neutral".
get_overlap_percent(
    "Forderer:in", "Neutral", data_dict_list, "Gerichtsurteile-neg-AW-neu-optimiert-BB"
)

In [None]:
# Pairwise overlap matrix over all labels found in the file, drawn as a heatmap.
df = get_percent_matrix(data_dict_list, "Gerichtsurteile-neg-AW-neu-optimiert-BB")
plt.figure(figsize=(16, 16))
ax = sns.heatmap(df, cmap="cividis")

In [None]:
# Same heatmap, restricted to an explicitly chosen list of labels.
df_small = get_percent_matrix(
    data_dict_list,
    "Gerichtsurteile-neg-AW-neu-optimiert-BB",
    [
        "Appell",
        "Adresassat:in",
        "Forderer:in",
        "soziale Gruppe",
        "Benefizient:in",
        "Neutral",
        "Institution",
        "Expression",
    ],
)
plt.figure(figsize=(8, 8))
ax = sns.heatmap(df_small, cmap="cividis")