In [1]:
import os
import json
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

In [5]:
# Results container keyed by model name. Every model starts with its own empty
# dict (a fresh object per key, not a shared one) that the loading statements
# later in this cell fill in.
data = {
    model_name: {}
    for model_name in ('gan-textgen-bert', 'bert', 'gan-textgen-transformer')
}

def load_stats(stats_dir: str) -> dict:
    """Merge every JSON file directly inside ``stats_dir`` into one dict.

    Each regular file in the directory is parsed as JSON and merged into the
    result with ``dict.update``. Files are visited in sorted filename order,
    so when two files define the same key the file that sorts last wins.
    Entries that are not regular files (for example the ``.ipynb_checkpoints``
    directory Jupyter may create) are skipped instead of being opened.

    Args:
        stats_dir: Directory that contains the JSON stats files.

    Returns:
        The merged dict; empty if the directory contains no files.

    Raises:
        FileNotFoundError: If ``stats_dir`` does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # Named `merged` rather than `stats` so the `scipy.stats as stats` alias
    # imported at the top of the notebook is not shadowed in this function.
    merged = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # reproducible when several files share a key.
    for filename in sorted(os.listdir(stats_dir)):
        stats_file = os.path.join(stats_dir, filename)
        if not os.path.isfile(stats_file):
            continue
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(stats_file, 'r', encoding='utf-8') as f:
            merged.update(json.load(f))
    return merged

# All per-model stats directories live under one experiment folder, in a
# sub-directory named after the model key used in `data`.
STATS_ROOT = '../core/stats/llmsubj/samples-20_naug-1'

# Fill each model's (initially empty) dict in place. Only the inner dicts are
# mutated, so iterating over `data` while updating is safe.
for model_name, model_stats in data.items():
    model_stats.update(load_stats(os.path.join(STATS_ROOT, model_name)))

In [6]:
def calculate_interval(samples: list[float], confidence_level: float = 0.95) -> tuple[float, float]:
    """Return the sample mean and the margin of error of its confidence interval.

    The margin is the half-width of a two-sided Student-t interval:
    ``t * s / sqrt(n)``, where ``s`` is the sample standard deviation
    (``ddof=1``) and ``t`` is the t quantile at ``(1 + confidence_level) / 2``
    with ``n - 1`` degrees of freedom. Note that the result is
    ``(mean, margin)``, not ``(low, high)``; the interval itself is
    ``mean ± margin``.

    Args:
        samples: Observed values.
        confidence_level: Two-sided confidence level, strictly between 0 and 1.

    Returns:
        ``(sample_mean, margin_of_error)``. With fewer than two samples the
        margin is undefined and is returned as NaN; with no samples the mean
        is NaN as well.

    Raises:
        ValueError: If ``confidence_level`` is not strictly between 0 and 1.
    """
    if not 0 < confidence_level < 1:
        raise ValueError(
            'confidence_level must be strictly between 0 and 1, got {!r}'.format(confidence_level)
        )

    sample_size = len(samples)
    if sample_size < 2:
        # The sample standard deviation needs at least two points. Return NaN
        # explicitly rather than letting numpy emit RuntimeWarnings on the way
        # to the same NaN result.
        nan = float('nan')
        return (np.mean(samples) if sample_size else nan), nan

    sample_mean = np.mean(samples)
    standard_error = np.std(samples, ddof=1) / np.sqrt(sample_size)
    t_score = stats.t.ppf((1 + confidence_level) / 2, df=sample_size - 1)

    return sample_mean, t_score * standard_error

In [7]:
# For every dataset, print each model's mean accuracy together with the margin
# of error returned by calculate_interval (default confidence level).
# The dataset list is taken from the 'bert' results, so a dataset missing from
# one of the other models raises KeyError here.
datasets = sorted(data['bert'])

# Models in the order they are reported for each dataset.
MODEL_NAMES = ('gan-textgen-bert', 'bert', 'gan-textgen-transformer')

for dataset in datasets:
    print('Dataset ' + dataset)
    for model_name in MODEL_NAMES:
        # Each entry under a dataset is iterated as a collection of records
        # that carry an 'accuracy' value.
        accs = [run['accuracy'] for run in data[model_name][dataset]]
        avg, err = calculate_interval(accs)
        print('{} acc avg: {:.3f} err: {:.3f}'.format(model_name, avg, err))
    print()

Dataset cllmsubj_001
gan-textgen-bert acc avg: 0.881 err: 0.007
bert acc avg: 0.833 err: 0.083
gan-textgen-transformer acc avg: 0.865 err: 0.018

Dataset cllmsubj_002
gan-textgen-bert acc avg: 0.906 err: 0.006
bert acc avg: 0.892 err: 0.011
gan-textgen-transformer acc avg: 0.906 err: 0.015

Dataset cllmsubj_005
gan-textgen-bert acc avg: 0.910 err: 0.007
bert acc avg: 0.888 err: 0.045
gan-textgen-transformer acc avg: 0.914 err: 0.004

Dataset subj_001
gan-textgen-bert acc avg: 0.866 err: 0.015
bert acc avg: 0.851 err: 0.021
gan-textgen-transformer acc avg: 0.878 err: 0.010

Dataset subj_002
gan-textgen-bert acc avg: 0.898 err: 0.007
bert acc avg: 0.894 err: 0.011
gan-textgen-transformer acc avg: 0.897 err: 0.006

Dataset subj_005
gan-textgen-bert acc avg: 0.914 err: 0.002
bert acc avg: 0.889 err: 0.021
gan-textgen-transformer acc avg: 0.911 err: 0.005

