In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams["figure.figsize"] = (12, 16)

In [None]:
import pandas as pd

In [None]:
# Dataset splits to analyse; each name fills the `{data}` slot of the
# `analysis.{data}.{model}.csv` file names read further down.
DATASETS: list = [
    "train",
    "test",
]

In [None]:
# Model identifiers; each fills the `{model}` slot of the analysis CSV file
# names and is used as the column prefix for that model's metrics.
MODELS: list = [
    "base",
    "textattack",
    "fabriceyhc",
    "wakaka",
]

In [None]:
### Load datasets into memory (one analysis CSV per dataset/model pair)

In [None]:
# Build one wide frame per dataset split, holding every model's metrics side
# by side. Each CSV is re-indexed by its `dim` column and its columns are
# prefixed with the model name so they stay distinguishable after the join.
#
# The mapping is keyed by `data` only, so looping over `model` in the same
# comprehension would let each model overwrite the previous one; the per-model
# frames are therefore concatenated column-wise within each split instead.
# NOTE(review): assumes all models' files for a split share the same `dim`
# values, so the column-wise concat aligns rows — confirm.
analysis: dict = {
    data: pd.concat(
        [
            (
                pd
                .read_csv(f'analysis.{data}.{model}.csv', index_col=0)
                .set_index(['dim'])
                .add_prefix(f'{model}_')
            )
            for model in MODELS
        ],
        axis=1,
    )
    for data in DATASETS
}

In [None]:
# Show each split's name followed by its loaded frame.
for label in analysis:
    data = analysis[label]
    display(label, data)

In [None]:
### Calculate Centroid Distance and Cluster Dispersion

In [None]:
# Per-split summary statistics of every column matching `.*_distance`.
# After the transpose, each row is one such column and the columns are the
# statistics produced by `describe()`.
distances: dict = {
    split: frame.filter(regex=".*_distance").describe().transpose()
    for split, frame in analysis.items()
}

In [None]:
# Write and display a compact distance summary for each dataset split.
for label, data in distances.items():
    formatted: pd.DataFrame = (
        data
        [['mean', 'std', 'min', 'max']]
        # Relabel the rows with the full model names. MODELS is a list of
        # strings, so `zip(*MODELS)` would iterate their characters and yield
        # only the first letter of each name.
        # NOTE(review): labels are assigned positionally, which assumes one
        # `*_distance` row per model, in MODELS order — confirm.
        .set_index(pd.Index(MODELS))
        .round(3)
    )
    formatted.to_csv(f'{label}.metric.distance.csv')
    display(formatted)

In [None]:
# Per-split summary statistics of every column matching `.*_dispersion`,
# with rows relabelled by a (model, group) MultiIndex.
#
# The model level is taken from MODELS directly: MODELS is a list of strings,
# so `zip(*MODELS)` would iterate their characters and yield only the first
# letter of each name.
# NOTE(review): labels are assigned positionally, which assumes each model
# contributes a positive then a negative dispersion column, in MODELS order
# — confirm against the CSV schema.
dispersion: dict = {
    label: (
        data
        .filter(regex=".*_dispersion")
        .describe()
        .T
        .set_index(
            pd.MultiIndex.from_product(
                [MODELS, ['positive', 'negative']],
                names=["model", "group"],
            )
        )
    )
    for label, data in analysis.items()
}

In [None]:
# Write and display a compact dispersion summary for each dataset split.
for label, data in dispersion.items():
    formatted: pd.DataFrame = (
        data
        # The rows already carry the (model, group) MultiIndex set when
        # `dispersion` was built, so no re-indexing happens here: a flat
        # per-model index has one entry per model and cannot label rows that
        # are split by group as well.
        [['mean', 'std', 'min', 'max']]
        .round(3)
    )
    formatted.to_csv(f'{label}.metric.dispersion.csv')
    display(formatted)