# Demo notebook for statistical analysis
SSC, September 2022

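This notebook demonstrates how to use the data analysis methods of the `moralization` package: reading in the data, reporting occurrences and spans, and computing and plotting the correlation of category occurrences per paragraph.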

In [None]:
# Please ignore this cell: extra install steps that are only executed when running the notebook on Google Colab
# flake8-noqa-cell
import os
if 'google.colab' in str(get_ipython()) and not os.path.isdir('Test_Data'):
    # we're running on colab and we haven't already downloaded the test data
    # first install pinned version of setuptools (latest version doesn't seem to work with this package on colab)
    !pip install setuptools==61 -qqq
    # install the moralization package
    !pip install git+https://github.com/ssciwr/moralization.git -qqq
    # download test data sets
    !wget https://github.com/ssciwr/moralization/archive/refs/heads/test_data.zip -q
    !mkdir -p data && unzip -qq test_data.zip && mv -f moralization-test_data/*_Data ./data/. && rm -rf moralization-test_data test_data.zip

In [None]:
import os

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from moralization import input as inp
from moralization import analyse as ae
from moralization import plot as pl

## Read in the data.

In [None]:
# Location of the test data. The default path points one level up, to ../data.
# The Colab setup cell at the top of this notebook unpacks the data into ./data
# instead, so fall back to that location when only it exists.
data_dir = "../data/Test_Data/XMI_11"
if not os.path.isdir(data_dir) and os.path.isdir("data/Test_Data/XMI_11"):
    data_dir = "data/Test_Data/XMI_11"

# Read everything in the chosen directory in one go.
data_dict = inp.InputOutput.read_data(data_dir)

## Report instances of occurrence.

In [None]:
# Run the occurrence analysis in "instances" mode and keep its data frame.
occurrence_instances = ae.AnalyseOccurrence(data_dict, mode="instances")
df_instances = occurrence_instances.df

# Write the complete table to disk, then preview only the first ten rows.
df_instances.to_csv("instances_out.csv")
df_instances.head(10)

## Report spans

In [None]:
# Run the occurrence analysis in "spans" mode and keep its data frame.
occurrence_spans = ae.AnalyseOccurrence(data_dict, mode="spans")
df_spans = occurrence_spans.df

# Write the complete table to disk, then preview only the first ten rows.
df_spans.to_csv("spans_out.csv")
df_spans.head(10)

## Report index of the spans

In [None]:
# Run the occurrence analysis in "span_index" mode and keep its data frame.
occurrence_span_index = ae.AnalyseOccurrence(data_dict, mode="span_index")
df_span_index = occurrence_span_index.df

# Write the complete table to disk, then preview only the first ten rows.
df_span_index.to_csv("span_index_out.csv")
df_span_index.head(10)

### These dataframes can now easily be filtered

In [None]:
# Show only the entries stored under this index label of the instances table.
selected_category = "KAT2-Moralwerte"
df_instances.loc[selected_category]

## Report which categories occur how often in which paragraph

Warning: This is a long list and is best exported to Excel for viewing.

In [None]:
# Build the per-paragraph occurrence report from the loaded data.
report_per_paragraph = ae.AnalyseSpans.report_occurrence_per_paragraph
df_paragraphs = report_per_paragraph(data_dict)

# The full report is long, so only preview the first five rows here.
df_paragraphs.head(5)

## Report correlation matrix for paragraph occurrence
### Pearson correlation for pairs of values

In [None]:
# Compute the occurrence matrix from the per-paragraph report and display it.
occurrence_matrix = pl.PlotSpans.report_occurrence_matrix
df_corr = occurrence_matrix(df_paragraphs)
df_corr

### This matrix can also easily be filtered by both main categories and subcategories

In [None]:
# Labels to restrict the occurrence matrix to.
category_filter = ["KAT1-Moralisierendes Segment", "Neutral", "Care"]

# Same matrix as for the unfiltered report, limited to the labels above.
df_corr_filtered = pl.PlotSpans.report_occurrence_matrix(df_paragraphs, filter_=category_filter)
df_corr_filtered

### Plot correlation matrix as heatmap

In [None]:
# Draw the occurrence heatmap for the per-paragraph report and show the result.
occurrence_heatmap = pl.PlotSpans.report_occurrence_heatmap
heatmap = occurrence_heatmap(df_paragraphs)
heatmap

### The heatmap can be filtered the same way as the matrix.

In [None]:
# Labels to restrict the heatmap to.
heatmap_filter = ["KAT1-Moralisierendes Segment", "Neutral", "Care"]

# Same heatmap call as for the unfiltered report, limited to the labels above.
heatmap_filtered = pl.PlotSpans.report_occurrence_heatmap(df_paragraphs, filter_=heatmap_filter)
heatmap_filtered