# Demo notebook for statistical analysis
SSC, September 2022

This notebook demonstrates how to use the data analysis methods of the `moralization` package.

In [None]:
# Please ignore this cell: extra install steps that are only executed when running the notebook on Google Colab
# flake8-noqa-cell
import os
# The setup only runs when the IPython shell reports itself as a Colab shell AND no
# `Test_Data` directory exists in the current working directory, so re-running this
# cell after a successful download is a no-op.
if 'google.colab' in str(get_ipython()) and not os.path.isdir('Test_Data'):
    # we're running on colab and we haven't already downloaded the test data
    # first install pinned version of setuptools (latest version doesn't seem to work with this package on colab)
    !pip install setuptools==61 -qqq
    # install the moralization package straight from the GitHub repository (unpinned)
    !pip install git+https://github.com/ssciwr/moralization.git -qqq
    # download the archive of the `test_data` branch, which holds the test data sets
    !wget https://github.com/ssciwr/moralization/archive/refs/heads/test_data.zip -q
    # unpack the archive, move every `*_Data` directory into the current directory,
    # then remove the extracted folder and the zip file
    !unzip -qq test_data.zip && mv -f moralization-test_data/*_Data . && rm -rf moralization-test_data test_data.zip

In [None]:
import moralization as mn

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

Read in the data.

In [None]:
# Directory with the XMI input files, relative to the notebook's working directory.
# NOTE(review): the Colab setup cell moves the downloaded `*_Data` folders into the
# current directory, so this relative path may not exist on Colab — confirm.
input_dir = "../../moralization_data/Test_Data/XMI_11"
data_dict = mn.InputOutput.get_input_dir(input_dir)

Report instances of occurrence.

In [None]:
# Run the occurrence analysis in "instances" mode and keep its DataFrame.
# The frame can be saved as a csv file, e.g. with `df_instances.to_csv(...)`.
# Note: calling `df_instances.corr()` directly did not give a satisfactory
# correlation table; the correlation matrix is built in a later cell instead.
instances_analysis = mn.analyse.AnalyseOccurence(data_dict, mode="instances")
df_instances = instances_analysis.df
df_instances.head(10)

In [None]:
# The instances table can be filtered by index label, here for a single category.
# NOTE(review): the label is spelled without an umlaut ("Ausdrcke"); presumably this
# matches the label used in the data — confirm before "correcting" it.
category = "KAT2Subjektive_Ausdrcke"
df_instances.loc[category]

Report the spans of the instances.

In [None]:
# Same analysis class as for the instances, but run in "spans" mode.
spans_analysis = mn.analyse.AnalyseOccurence(data_dict, mode="spans")
df_spans = spans_analysis.df
df_spans.head(10)

Report which categories are in which sentence.

Warning: This is a long list and is best exported to excel for viewing.

(for example: `df_sentences.to_csv("sentence_cat.csv")`)

In [None]:
# Occurrence report in "paragraphs" mode.
df_paragraphs = mn.analyse.report_occurence_per_sentence(
    data_dict, mode="paragraphs"
)
# Print the number of rows first, since only the head of the frame is displayed.
n_paragraph_rows = len(df_paragraphs)
print(n_paragraph_rows)
df_paragraphs.head()

In [None]:
# Occurrence report in "sentences" mode; this frame feeds the matrix and heatmap cells.
df_sentences = mn.analyse.report_occurence_per_sentence(
    data_dict, mode="sentences"
)
# Print the number of rows first, since only the head of the frame is displayed.
n_sentence_rows = len(df_sentences)
print(n_sentence_rows)
df_sentences.head()

Report correlation matrix for sentence occurrence.

This can also be saved using `df_corr.to_csv("cat_corr.csv")`.

In [None]:
# Build the matrix from the per-sentence report (`df_sentences`).
df_corr = mn.analyse.report_occurence_matrix(df_sentences)
# Bare last expression so the notebook renders the result.
df_corr

This matrix can also be easily filtered on both main and sub categories.

In [None]:
# Category names to restrict the matrix to, passed through the `filter` keyword.
matrix_categories = ["KAT1MoralisierendesSegment", "Neutral", "Care"]
df_corr_filtered = mn.analyse.report_occurence_matrix(
    df_sentences, filter=matrix_categories
)
df_corr_filtered

One can also view the correlation matrix as a heatmap.

In [None]:
# Heatmap built from the same per-sentence report that feeds the matrix cells.
heatmap = mn.analyse.report_occurence_heatmap(df_sentences)
# Bare last expression so the notebook displays the returned object.
heatmap

The heatmap can be filtered the same way as the matrix.

In [None]:
# Same category selection as used for the filtered matrix.
heatmap_categories = ["KAT1MoralisierendesSegment", "Neutral", "Care"]
heatmap_filtered = mn.analyse.report_occurence_heatmap(
    df_sentences, filter=heatmap_categories
)
heatmap_filtered