In [None]:
import glob
import json
import typing

import langdetect
import pandas as pd
import seaborn as sns
import matplotlib as mpl

In [None]:
import config

# Project-level settings object. The cells below read its data_raw_dir,
# data_dir, dataset_name, report_dir and lang attributes.
CFG: config.Config = config.Config()

In [None]:
# Load every raw sample file into memory.
# - sorted(): glob order is OS-dependent; sorting keeps the merged file stable
#   between runs.
# - `with`: closes each handle deterministically instead of leaving it to the
#   garbage collector.
# - encoding='utf-8': JSON text is UTF-8; do not depend on the locale default.
raw_samples: typing.List[dict] = []
for sample_path in sorted(glob.glob(f'{CFG.data_raw_dir}/*.json')):
    with open(sample_path, 'r', encoding='utf-8') as sample_file:
        raw_samples.append(json.load(sample_file))

# Persist all samples as one JSON array. ensure_ascii=False keeps non-ASCII
# characters readable, which is why the output encoding is pinned explicitly.
with open(f'{CFG.data_dir}/{CFG.dataset_name}.json', 'w', encoding='utf-8') as merged_file:
    json.dump(raw_samples, merged_file, indent=4, ensure_ascii=False)

# Cell output: number of samples loaded.
len(raw_samples)

In [None]:
# Flatten the (possibly nested) sample dicts into one row per sample.
samples_flat = pd.json_normalize(raw_samples).astype({'model': 'category'})

# Drop the first ':' and everything after it from each model name
# (presumably a tag/version suffix -- confirm against the raw data).
# NOTE(review): `.str.replace` on a categorical column appears to hand back a
# plain object column, so the 'category' dtype set above likely does not
# survive this step -- confirm whether that is intended.
model_without_suffix = samples_flat['model'].str.replace(':.*', '', regex=True)
dataset: pd.DataFrame = samples_flat.assign(model=model_without_suffix)

# Persist the normalized table, then display it as the cell output.
dataset.to_parquet(f'{CFG.data_dir}/{CFG.dataset_name}.parquet')
dataset

In [None]:
# Number of samples per model name (shown as the cell output).
dataset.loc[:, 'model'].value_counts()

In [None]:
# langdetect is non-deterministic by default; pinning the seed before the
# first detection makes the `lang` column reproducible across runs.
langdetect.DetectorFactory.seed = 0


def _detect_language(text: typing.Any) -> typing.Optional[str]:
    """Return the language code langdetect assigns to `text`, or None.

    None is returned when `text` is not a string, when it is at most one
    character long, or when langdetect raises because it cannot extract any
    features from the text (e.g. digits or punctuation only).
    """
    if not isinstance(text, str) or len(text) <= 1:
        return None
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        # One undetectable response must not abort labelling of the whole table.
        return None


# Only the `response` column is needed, so map over it instead of applying
# a function to every full row.
dataset['lang'] = dataset['response'].map(_detect_language)

In [None]:
# Share of each detected language within every model's responses.
# Rows of the resulting table are languages, columns are models.
lang_share_by_model: pd.DataFrame = (
    dataset
    .groupby('model')
    ['lang']
    .value_counts(normalize=True)
    .round(3)
    .to_frame()
    # Hide languages that are (after rounding) at or below 0.1 % for a model.
    .pipe(lambda _df: _df[_df['proportion'] > 0.001])
    .reset_index()
    .pivot(index="model", columns="lang", values="proportion")
    .T
)

heatmap_ax = sns.heatmap(
    lang_share_by_model,
    annot=True,
    fmt='g',
    linewidth=.5,
)
# Save through the Axes' own Figure. `mpl.pyplot` is only reachable when some
# other package happened to import matplotlib.pyplot first, so don't rely on it.
heatmap_ax.figure.savefig(f'{CFG.report_dir}/plot.heat.model.lang.pdf', format='pdf')

In [None]:
# Keep only the rows whose detected language equals the configured target
# language (CFG.lang), then persist that subset as CSV and Parquet.
is_target_lang = dataset['lang'].eq(CFG.lang)
german_subset: pd.DataFrame = dataset.loc[is_target_lang]

german_subset.to_csv(f'{CFG.data_dir}/{CFG.dataset_name}.{CFG.lang}.csv')
german_subset.to_parquet(f'{CFG.data_dir}/{CFG.dataset_name}.{CFG.lang}.parquet')

In [None]:
# Fixed seed so the stratified draw (and the files written below) is the same
# on every Restart & Run All. Consider moving this into the project config.
STRATIFIED_SAMPLE_SEED = 42

# Downsample every model to the size of the smallest model group so that all
# models contribute the same number of rows.
smallest_model_count = german_subset['model'].value_counts().min()
german_subset_stratified: pd.DataFrame = (
    german_subset.groupby('model', observed=False)
    .sample(n=smallest_model_count, random_state=STRATIFIED_SAMPLE_SEED)
)
german_subset_stratified.to_csv(f'{CFG.data_dir}/{CFG.dataset_name}.{CFG.lang}.strat.csv')
german_subset_stratified.to_parquet(f'{CFG.data_dir}/{CFG.dataset_name}.{CFG.lang}.strat.parquet')