# This notebook supports the creation of overview tables for experiments on the Adult dataset.

Organize imports, set constants, and load result files.

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from ast import literal_eval

# CONSTANT selects the log directory (`log<CONSTANT>/`) and is also the base
# value from which the `budgets_linear` / `limit` filters in the later cells
# are built.
CONSTANT = 2
PATH = f'log{CONSTANT}/'

# Result files from the main experiment directory: one for the voting stage
# and one for the student stage.
df_v = pd.read_csv(f'{PATH}results_voting.csv')
df_s = pd.read_csv(f'{PATH}results_student.csv')

# Matching result files from the `baseline/` subdirectory; later cells report
# these as "baselines" / "GNMax".
df_vb = pd.read_csv(f'{PATH}baseline/results_voting.csv')
df_sb = pd.read_csv(f'{PATH}baseline/results_student.csv')

In [2]:
print('Statistics on voting accuracies (in %):')

# Min / max / mean / std of the `accuracy` column, expressed in percent and
# rounded to two decimals. The NumPy reductions are used on purpose: np.std
# is the population standard deviation (ddof=0).
acc_voting = df_v['accuracy']
acc_voting_baseline = df_vb['accuracy']

va_min, va_max, va_avg, va_std = [
    round(reduce_fn(acc_voting) * 100, 2)
    for reduce_fn in (np.min, np.max, np.mean, np.std)
]

bva_min, bva_max, bva_avg, bva_std = [
    round(reduce_fn(acc_voting_baseline) * 100, 2)
    for reduce_fn in (np.min, np.max, np.mean, np.std)
]

for stat_label, stat_value in (('min:', va_min), ('max:', va_max),
                               ('avg:', va_avg), ('std:', va_std)):
    print(stat_label, stat_value)
print('\nbaselines:')
for stat_label, stat_value in (('min:', bva_min), ('max:', bva_max),
                               ('avg:', bva_avg), ('std:', bva_std)):
    print(stat_label, stat_value)

Statistics on voting accuracies (in %):
min: 84.5
max: 87.6
avg: 86.16
std: 0.6

baselines:
min: 84.9
max: 87.5
avg: 86.18
std: 0.61


In [3]:
# Mean voting accuracy (in %) for every combination of budget factor and
# distribution. Each table entry is a (uGNMax, vGNMax, wGNMax) tuple; rows are
# the budget factors, columns the distributions.
df = df_v

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75]
collectors = ('uGNMax', 'vGNMax', 'wGNMax')

res = {e: [] for e in epsilons}
for e in epsilons:
    # The CSV stores budgets and distributions as strings, so the filters are
    # compared against the string form of the corresponding Python objects.
    budgets_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({label: (1 - d, d) for label in [0, 1]})
        data = df[(df['budgets_linear'] == budgets_key) &
                  (df['distribution'] == distribution_key)]
        u, v, w = (
            round(np.mean(data.loc[data['collector'] == name, 'accuracy']) * 100, 2)
            for name in collectors
        )
        res[e].append((u, v, w))

print('Table of average voting accuracy per personalization.\n')
print('average of GNMax:', round(np.mean(df_vb['accuracy']) * 100, 2))
pd.DataFrame(res, index=distributions).T

Table of average voting accuracy per personalization.

average of GNMax: 86.18


Unnamed: 0,0.25,0.50,0.75
2,"(86.19, 86.15, 86.15)","(86.17, 86.16, 86.19)","(86.13, 86.17, 86.16)"
4,"(86.17, 86.11, 86.15)","(86.17, 86.15, 86.18)","(86.17, 86.16, 86.17)"
8,"(86.2, 86.15, 86.16)","(86.18, 86.16, 86.18)","(86.15, 86.17, 86.17)"


In [4]:
# Mean number of produced labels for every combination of budget factor and
# distribution. Each table entry is a (uGNMax, vGNMax, wGNMax) tuple; rows are
# the budget factors, columns the distributions.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75]
collectors = ('uGNMax', 'vGNMax', 'wGNMax')

res = {e: [] for e in epsilons}
for e in epsilons:
    # Filters are compared against the string form stored in the CSV.
    budgets_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({label: (1 - d, d) for label in [0, 1]})
        data = df[(df['budgets_linear'] == budgets_key) &
                  (df['distribution'] == distribution_key)]
        u, v, w = (
            round(np.mean(data.loc[data['collector'] == name, 'n_labels']), 2)
            for name in collectors
        )
        res[e].append((u, v, w))

print('Table of average number of produced labels per personalization.\n')
for e in [1, 2, 4, 8]:
    # Baseline rows are selected by `limit`, matched against the string form
    # of a 1-tuple holding the natural log of the linear budget.
    x = math.log(CONSTANT * 1.0 * e)
    baseline_rows = df_sb[df_sb['limit'] == f'({x},)']
    print(f'GNMax with budget {CONSTANT} * {e}:',
          round(np.mean(baseline_rows['n_labels']), 2))
pd.DataFrame(res, index=distributions).T

Table of average number of produced labels per personalization.

GNMax with budget 2 * 1: 88.46
GNMax with budget 2 * 2: 354.04
GNMax with budget 2 * 4: 763.08
GNMax with budget 2 * 8: 1287.86


Unnamed: 0,0.25,0.50,0.75
2,"(140.24, 15.52, 139.36)","(202.38, 57.16, 203.28)","(272.4, 132.94, 272.52)"
4,"(197.82, 9.54, 198.02)","(345.68, 74.68, 348.74)","(541.46, 236.18, 542.52)"
8,"(263.64, 7.66, 259.16)","(529.56, 95.4, 530.46)","(868.1, 399.72, 871.62)"


In [5]:
# Mean student test accuracy (in %) for every combination of budget factor and
# distribution. Each table entry is a (uGNMax, vGNMax, wGNMax) tuple; rows are
# the budget factors, columns the distributions.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75]
collectors = ('uGNMax', 'vGNMax', 'wGNMax')

res = {e: [] for e in epsilons}
for e in epsilons:
    # Filters are compared against the string form stored in the CSV.
    budgets_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({label: (1 - d, d) for label in [0, 1]})
        data = df[(df['budgets_linear'] == budgets_key) &
                  (df['distribution'] == distribution_key)]
        u, v, w = (
            round(np.mean(data.loc[data['collector'] == name, 'test_accuracy']) * 100, 2)
            for name in collectors
        )
        res[e].append((u, v, w))

print('Table of average accuracy (in %) per personalization.\n')
for e in [1, 2, 4, 8]:
    # Baseline rows are selected by `limit`, matched against the string form
    # of a 1-tuple holding the natural log of the linear budget.
    x = math.log(CONSTANT * 1.0 * e)
    baseline_rows = df_sb[df_sb['limit'] == f'({x},)']
    print(f'GNMax with budget {CONSTANT} * {e}:',
          round(np.mean(baseline_rows['test_accuracy']) * 100, 2))
pd.DataFrame(res, index=distributions).T

Table of average accuracy (in %) per personalization.

GNMax with budget 2 * 1: 79.85
GNMax with budget 2 * 2: 82.52
GNMax with budget 2 * 4: 83.02
GNMax with budget 2 * 8: 83.23


Unnamed: 0,0.25,0.50,0.75
2,"(81.02, 76.17, 80.87)","(81.76, 78.62, 81.76)","(82.16, 80.7, 82.26)"
4,"(81.79, 75.73, 81.67)","(82.52, 79.32, 82.6)","(82.87, 82.09, 82.89)"
8,"(82.3, 75.31, 82.25)","(82.82, 79.97, 82.84)","(83.07, 82.72, 83.04)"


In [6]:
# Mean test accuracy (in %) of the baseline runs whose `limit` equals the
# average budget of each personalized setting. Rows of the resulting table are
# the budget factors, columns the distributions.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75]

# Derive the result keys from `epsilons` so the two cannot drift apart.
res = {e: [] for e in epsilons}
for e in epsilons:
    for d in distributions:
        data = df[(df['budgets_linear'] == str([CONSTANT * 1.0, CONSTANT * 1.0 * e])) &
                  (df['distribution'] == str({label: (1 - d, d) for label in [0, 1]}))]
        # All rows of one setting are expected to share the same average
        # budget, so the first row is used (raises IndexError if the filter
        # matched nothing).
        t = data['avg_budget'].iloc[0]
        # `.iloc[0]` yields a NumPy scalar. Under NumPy >= 2 its repr inside a
        # tuple is e.g. `np.float64(1.03)`, so str((t,)) would never equal the
        # plain '(1.03,)' strings stored in `limit` and every mean would
        # silently become NaN. Convert to the native Python scalar first.
        if isinstance(t, np.generic):
            t = t.item()
        baseline_rows = df_sb[df_sb['limit'] == str((t,))]
        res[e].append(round(np.mean(baseline_rows['test_accuracy']) * 100, 2))

print('Table of average accuracy for non-personalized GNMax with average budget per personalization.\n')
pd.DataFrame(res, index=distributions).T

Table of average accuracy for non-personalized GNMax with average budget per personalization.



Unnamed: 0,0.25,0.50,0.75
2,81.18,82.0,82.36
4,82.36,82.81,83.01
8,82.84,83.1,83.19


In [7]:
# Mean number of labels of the baseline runs whose `limit` equals the average
# budget of each personalized setting. Rows of the resulting table are the
# budget factors, columns the distributions.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75]

# Derive the result keys from `epsilons` so the two cannot drift apart.
res = {e: [] for e in epsilons}
for e in epsilons:
    for d in distributions:
        data = df[(df['budgets_linear'] == str([CONSTANT * 1.0, CONSTANT * 1.0 * e])) &
                  (df['distribution'] == str({label: (1 - d, d) for label in [0, 1]}))]
        # All rows of one setting are expected to share the same average
        # budget, so the first row is used (raises IndexError if the filter
        # matched nothing).
        t = data['avg_budget'].iloc[0]
        # `.iloc[0]` yields a NumPy scalar. Under NumPy >= 2 its repr inside a
        # tuple is e.g. `np.float64(1.03)`, so str((t,)) would never equal the
        # plain '(1.03,)' strings stored in `limit` and every mean would
        # silently become NaN. Convert to the native Python scalar first.
        if isinstance(t, np.generic):
            t = t.item()
        baseline_rows = df_sb[df_sb['limit'] == str((t,))]
        res[e].append(round(np.mean(baseline_rows['n_labels']), 2))

print('Table of average number of labels for non-personalized GNMax with average budget per personalization.\n')
pd.DataFrame(res, index=distributions).T

Table of average number of labels for non-personalized GNMax with average budget per personalization.



Unnamed: 0,0.25,0.50,0.75
2,157.94,225.34,290.64
4,290.64,474.38,627.66
8,527.58,842.54,1089.88
