# Demo notebook for statistical analysis
SSC, September 2022

This notebook demonstrates how to use the data analysis methods of the `moralization` package.

In [None]:
# Please ignore this cell: extra install steps that are only executed when running the notebook on Google Colab
# flake8-noqa-cell
import os
if 'google.colab' in str(get_ipython()) and not os.path.isdir('Test_Data'):
    # we're running on colab and we haven't already downloaded the test data
    # first install pinned version of setuptools (latest version doesn't seem to work with this package on colab)
    !pip install setuptools==61 -qqq
    # install the moralization package
    !pip install git+https://github.com/ssciwr/moralization.git -qqq
    # download test data sets
    !wget https://github.com/ssciwr/moralization/archive/refs/heads/test_data.zip -q
    !unzip -qq test_data.zip && mv -f moralization-test_data/*_Data . && rm -rf moralization-test_data test_data.zip

In [None]:
import moralization as mn

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

Read in the data.

In [None]:
# Read the input directory; the result is handed to every analysis call below.
input_dir = "Test_Data/XMI_11"
data_dict = mn.InputOutput.get_input_dir(input_dir)

Report instances of occurrence.

In [None]:
# TODO: a correlation table can presumably be derived from this result somehow;
# a plain df.corr() yields unsatisfactory results.

# Run the occurrence analysis in "instances" mode and keep its DataFrame.
# The DataFrame can easily be saved as a csv file.
instance_analysis = mn.analyse.AnalyseOccurence(data_dict, mode="instances")
df_instances = instance_analysis.df
df_instances.head(10)

In [None]:
# The DataFrame can be filtered by index label, e.g. to look at a single entry.
selected_label = "KAT2Subjektive_Ausdrcke"
df_instances.loc[selected_label]

Report the spans of the instances.

In [None]:
# Same analysis as for the instances, but run in "spans" mode.
span_analysis = mn.analyse.AnalyseOccurence(data_dict, mode="spans")
df_spans = span_analysis.df
df_spans.head(10)

Further exploratory analysis.

In [None]:
# Overlap percentage for the pair "Forderer:in" / "Neutral" in a single input file.
# NOTE(review): the file name is presumably a key of data_dict; confirm.
source_file = "test_data-trimmed_version_of-Gerichtsurteile-neg-AW-neu-optimiert-BB"
mn.analyse.get_overlap_percent("Forderer:in", "Neutral", data_dict, source_file)

In [None]:
# Percent matrix for one input file, drawn as a heatmap.
source_file = "test_data-trimmed_version_of-Gerichtsurteile-neg-AW-neu-optimiert-BB"
df = mn.analyse.get_percent_matrix(data_dict, source_file)

# Create the figure and its axes explicitly and hand the axes to seaborn,
# instead of relying on pyplot's implicit "current figure".
fig, heatmap_ax = plt.subplots(figsize=(16, 16))
ax = sns.heatmap(df, cmap="cividis", ax=heatmap_ax)

In [None]:
# Percent matrix for the same input file, restricted to a hand-picked list of entries.
source_file = "test_data-trimmed_version_of-Gerichtsurteile-neg-AW-neu-optimiert-BB"
selected_entries = [
    "Appell",
    "Forderer:in",
    "soziale Gruppe",
    "Benefizient:in",
    "Neutral",
    "Institution",
]
df_small = mn.analyse.get_percent_matrix(data_dict, source_file, selected_entries)

# Smaller canvas than for the full matrix; axes are created explicitly and passed to seaborn.
fig_small, heatmap_ax_small = plt.subplots(figsize=(8, 8))
ax = sns.heatmap(df_small, cmap="cividis", ax=heatmap_ax_small)