# This notebook supports the creation of overview tables for skew experiments on the Adult dataset.

Organize imports, set constants, and load result files.

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from ast import literal_eval

# Base budget of the experiment; it also selects the log directory to read.
CONSTANT = 2
PATH = f'log{CONSTANT}/'

# Result files stored directly under PATH (teachers, voting, student).
df_t, df_v, df_s = (
    pd.read_csv(f'{PATH}results_teachers.csv'),
    pd.read_csv(f'{PATH}results_voting.csv'),
    pd.read_csv(f'{PATH}results_student.csv'),
)

# Result files of the baseline runs stored under PATH/baseline.
df_tb, df_vb, df_sb = (
    pd.read_csv(f'{PATH}baseline/results_teachers.csv'),
    pd.read_csv(f'{PATH}baseline/results_voting.csv'),
    pd.read_csv(f'{PATH}baseline/results_student.csv'),
)

Set aliases for PATE variants.

In [2]:
# Aliases of the PATE variants, in the order they are listed here.
kinds = dict(
    GNMax='non-personalized',
    uGNMax='upsampling',
    vGNMax='vanishing',
    wGNMax='weighting',
)

In [3]:
# Voting results of the personalized runs.
df = df_v

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75, 1]

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'accuracy']) * 100, 2))

print('Table of average voting accuracy per personalization.\n')
print('average of GNMax:', round(np.mean(df_vb['accuracy']) * 100, 2))
pd.DataFrame(res, index=distributions).T

Table of average voting accuracy per personalization.

average of GNMax: 86.18


Unnamed: 0,0.25,0.50,0.75,1.00
2,86.68,86.61,86.19,85.64
4,86.6,85.64,84.12,82.57
8,86.25,84.12,81.82,79.52


In [4]:
# Student results of the personalized runs.
df = df_s

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'n_labels']), 2))

print('Table of average number of produced labels per personalization.\n')
# Baseline rows are looked up by the logarithm of the budget, formatted as a 1-tuple string.
for e in [1, 2, 4, 8]:
    x = math.log(CONSTANT * 1.0 * e)
    baseline_rows = df_sb[df_sb['limit'] == f'({x},)']
    print(f'GNMax with budget {CONSTANT} * {e}:', round(np.mean(baseline_rows['n_labels']), 2))
pd.DataFrame(res, index=distributions).T

Table of average number of produced labels per personalization.

GNMax with budget 2 * 1: 88.46
GNMax with budget 2 * 2: 354.04
GNMax with budget 2 * 4: 763.08
GNMax with budget 2 * 8: 1287.86


Unnamed: 0,0.25,0.50,0.75,1.00
2,89.66,95.24,101.04,108.98
4,93.32,107.88,132.16,162.26
8,95.68,128.6,171.64,224.6


In [5]:
# Student results of the personalized runs.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75, 1]

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'test_accuracy']) * 100, 2))

print('Table of average student accuracy (in %) per personalization.\n')
# Baseline rows are looked up by the logarithm of the budget, formatted as a 1-tuple string.
for e in [1, 2, 4, 8]:
    x = math.log(CONSTANT * 1.0 * e)
    baseline_rows = df_sb[df_sb['limit'] == f'({x},)']
    print(f'GNMax with budget {CONSTANT} * {e}:', round(np.mean(baseline_rows['test_accuracy']) * 100, 2))
pd.DataFrame(res, index=distributions).T

Table of average student accuracy (in %) per personalization.

GNMax with budget 2 * 1: 79.85
GNMax with budget 2 * 2: 82.52
GNMax with budget 2 * 4: 83.02
GNMax with budget 2 * 8: 83.23


Unnamed: 0,0.25,0.50,0.75,1.00
2,81.26,81.41,81.35,80.57
4,81.51,80.72,78.6,77.78
8,81.39,78.74,77.36,76.24


In [6]:
# Student results of the personalized runs; used here to look up each setting's average budget.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75, 1]

# Derive the rows from `epsilons` so the table cannot drift from the list above.
res = {e: [] for e in epsilons}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        # tolist() yields builtin Python scalars. A NumPy scalar inside a tuple is rendered as
        # "np.float64(...)" by NumPy >= 2, which would never match the strings in 'limit'.
        avg_budgets = df.loc[mask, 'avg_budget'].tolist()
        if not avg_budgets:
            # No run for this setting: report NaN (like an empty mean) instead of an IndexError.
            res[e].append(np.nan)
            continue
        # Only the first matching run's average budget is used for the baseline lookup.
        limit_key = str((avg_budgets[0],))
        baseline_rows = df_sb[df_sb['limit'] == limit_key]
        res[e].append(round(np.mean(baseline_rows['test_accuracy']) * 100, 2))

print('Table of average student accuracy for non-personalized GNMax with average budget per personalization.\n')
pd.DataFrame(res, index=distributions).T

Table of average student accuracy for non-personalized GNMax with average budget per personalization.



Unnamed: 0,0.25,0.50,0.75,1.00
2,80.34,80.71,80.91,81.22
4,80.91,81.8,82.13,82.36
8,81.87,82.48,82.65,82.91


In [7]:
# Student results of the personalized runs; used here to look up each setting's average budget.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75, 1]

# Derive the rows from `epsilons` so the table cannot drift from the list above.
res = {e: [] for e in epsilons}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        # tolist() yields builtin Python scalars. A NumPy scalar inside a tuple is rendered as
        # "np.float64(...)" by NumPy >= 2, which would never match the strings in 'limit'.
        avg_budgets = df.loc[mask, 'avg_budget'].tolist()
        if not avg_budgets:
            # No run for this setting: report NaN (like an empty mean) instead of an IndexError.
            res[e].append(np.nan)
            continue
        # Only the first matching run's average budget is used for the baseline lookup.
        limit_key = str((avg_budgets[0],))
        baseline_rows = df_sb[df_sb['limit'] == limit_key]
        res[e].append(round(np.mean(baseline_rows['n_labels']), 2))

print('Table of average number of labels for non-personalized GNMax with average budget per personalization.\n')
pd.DataFrame(res, index=distributions).T

Table of average number of labels for non-personalized GNMax with average budget per personalization.



Unnamed: 0,0.25,0.50,0.75,1.00
2,105.58,122.54,140.4,157.44
4,140.4,193.16,241.4,289.24
8,209.26,320.2,427.84,523.82


In [8]:
# Teacher results of the personalized runs.
df = df_t

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # The teacher file is matched against the tuple form of the budgets.
    budget_key = str((CONSTANT * 1.0, CONSTANT * 1.0 * e))
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'precision']) * 100, 2))

print('Table of average teacher precision (in %) per personalization.\n')
baseline_precision = round(np.mean(df_tb['precision']) * 100, 2)
print('GNMax:', baseline_precision)
pd.DataFrame(res, index=distributions).T

Table of average teacher precision (in %) per personalization.

GNMax: 69.41


Unnamed: 0,0.25,0.50,0.75,1.00
2,65.92,63.12,60.85,58.95
4,63.14,58.94,55.9,53.53
8,60.81,55.84,52.53,49.94


In [9]:
# Teacher results of the personalized runs.
df = df_t

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # The teacher file is matched against the tuple form of the budgets.
    budget_key = str((CONSTANT * 1.0, CONSTANT * 1.0 * e))
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'recall']) * 100, 2))

print('Table of average teacher recall (in %) per personalization.\n')
baseline_recall = round(np.mean(df_tb['recall']) * 100, 2)
print('GNMax:', baseline_recall)
pd.DataFrame(res, index=distributions).T

Table of average teacher recall (in %) per personalization.

GNMax: 48.82


Unnamed: 0,0.25,0.50,0.75,1.00
2,56.05,61.7,66.24,69.68
4,61.73,69.84,75.46,79.44
8,66.12,75.48,81.01,84.79


In [10]:
# Teacher results of the personalized runs.
df = df_t

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # The teacher file is matched against the tuple form of the budgets.
    budget_key = str((CONSTANT * 1.0, CONSTANT * 1.0 * e))
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'f1_score']) * 100, 2))

print('Table of average teacher f1_score (in %) per personalization.\n')
baseline_f1 = round(np.mean(df_tb['f1_score']) * 100, 2)
print('GNMax:', baseline_f1)
pd.DataFrame(res, index=distributions).T

Table of average teacher f1_score (in %) per personalization.

GNMax: 56.52


Unnamed: 0,0.25,0.50,0.75,1.00
2,59.94,61.89,63.0,63.48
4,61.9,63.55,63.94,63.72
8,62.92,63.9,63.52,62.68


In [11]:
# Student results of the personalized runs.
df = df_s

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'precision']) * 100, 2))

print('Table of average student precision (in %) per personalization.\n')
# Baseline rows are looked up by the logarithm of the budget, formatted as a 1-tuple string.
for e in [1, 2, 4, 8]:
    x = math.log(CONSTANT * 1.0 * e)
    baseline_rows = df_sb[df_sb['limit'] == f'({x},)']
    print(f'GNMax with budget {CONSTANT} * {e}:', round(np.mean(baseline_rows['precision']) * 100, 2))
pd.DataFrame(res, index=distributions).T

Table of average student precision (in %) per personalization.

GNMax with budget 2 * 1: 82.2
GNMax with budget 2 * 2: 79.96
GNMax with budget 2 * 4: 80.11
GNMax with budget 2 * 8: 80.35


Unnamed: 0,0.25,0.50,0.75,1.00
2,76.14,70.26,65.68,61.07
4,70.24,61.87,55.59,53.8
8,66.17,55.87,53.05,51.39


In [12]:
# Student results of the personalized runs; used here to look up each setting's average budget.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75, 1]

# Derive the rows from `epsilons` so the table cannot drift from the list above.
res = {e: [] for e in epsilons}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        # tolist() yields builtin Python scalars. A NumPy scalar inside a tuple is rendered as
        # "np.float64(...)" by NumPy >= 2, which would never match the strings in 'limit'.
        avg_budgets = df.loc[mask, 'avg_budget'].tolist()
        if not avg_budgets:
            # No run for this setting: report NaN (like an empty mean) instead of an IndexError.
            res[e].append(np.nan)
            continue
        # Only the first matching run's average budget is used for the baseline lookup.
        limit_key = str((avg_budgets[0],))
        baseline_rows = df_sb[df_sb['limit'] == limit_key]
        res[e].append(round(np.mean(baseline_rows['precision']) * 100, 2))

print('Table of average student precision (in %) for non-personalized GNMax with average budget per personalization.\n')
pd.DataFrame(res, index=distributions).T

Table of average student precision (in %) for non-personalized GNMax with average budget per personalization.



Unnamed: 0,0.25,0.50,0.75,1.00
2,81.76,82.13,80.52,80.4
4,80.52,80.74,79.81,79.99
8,80.32,80.09,80.12,79.97


In [13]:
# Student results of the personalized runs.
df = df_s

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'recall']) * 100, 2))

print('Table of average student recall (in %) per personalization.\n')
# Baseline rows are looked up by the logarithm of the budget, formatted as a 1-tuple string.
for e in [1, 2, 4, 8]:
    x = math.log(CONSTANT * 1.0 * e)
    baseline_rows = df_sb[df_sb['limit'] == f'({x},)']
    print(f'GNMax with budget {CONSTANT} * {e}:', round(np.mean(baseline_rows['recall']) * 100, 2))
pd.DataFrame(res, index=distributions).T

Table of average student recall (in %) per personalization.

GNMax with budget 2 * 1: 24.78
GNMax with budget 2 * 2: 39.51
GNMax with budget 2 * 4: 42.02
GNMax with budget 2 * 8: 42.91


Unnamed: 0,0.25,0.50,0.75,1.00
2,36.77,45.93,54.74,63.39
4,45.91,62.25,72.61,77.36
8,54.0,72.68,79.42,83.59


In [14]:
# Student results of the personalized runs; used here to look up each setting's average budget.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75, 1]

# Derive the rows from `epsilons` so the table cannot drift from the list above.
res = {e: [] for e in epsilons}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        # tolist() yields builtin Python scalars. A NumPy scalar inside a tuple is rendered as
        # "np.float64(...)" by NumPy >= 2, which would never match the strings in 'limit'.
        avg_budgets = df.loc[mask, 'avg_budget'].tolist()
        if not avg_budgets:
            # No run for this setting: report NaN (like an empty mean) instead of an IndexError.
            res[e].append(np.nan)
            continue
        # Only the first matching run's average budget is used for the baseline lookup.
        limit_key = str((avg_budgets[0],))
        baseline_rows = df_sb[df_sb['limit'] == limit_key]
        res[e].append(round(np.mean(baseline_rows['recall']) * 100, 2))

print('Table of average student recall (in %) for non-personalized GNMax with average budget per personalization.\n')
pd.DataFrame(res, index=distributions).T

Table of average student recall (in %) for non-personalized GNMax with average budget per personalization.



Unnamed: 0,0.25,0.50,0.75,1.00
2,27.26,28.71,30.55,32.41
4,30.55,35.19,37.58,38.66
8,35.84,39.13,40.05,41.58


In [15]:
# Student results of the personalized runs.
df = df_s

# One list per budget factor; each list gets one entry per distribution value.
res = {2: [], 4: [], 8: []}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        res[e].append(round(np.mean(df.loc[mask, 'f1_score']) * 100, 2))

print('Table of average student f1_score (in %) per personalization.\n')
# Baseline rows are looked up by the logarithm of the budget, formatted as a 1-tuple string.
for e in [1, 2, 4, 8]:
    x = math.log(CONSTANT * 1.0 * e)
    baseline_rows = df_sb[df_sb['limit'] == f'({x},)']
    print(f'GNMax with budget {CONSTANT} * {e}:', round(np.mean(baseline_rows['f1_score']) * 100, 2))
pd.DataFrame(res, index=distributions).T

Table of average student f1_score (in %) per personalization.

GNMax with budget 2 * 1: 36.96
GNMax with budget 2 * 2: 52.7
GNMax with budget 2 * 4: 55.03
GNMax with budget 2 * 8: 55.86


Unnamed: 0,0.25,0.50,0.75,1.00
2,48.65,54.51,59.0,61.72
4,54.72,61.46,62.76,63.35
8,58.64,62.95,63.52,63.6


In [16]:
# Student results of the personalized runs; used here to look up each setting's average budget.
df = df_s

epsilons = [2, 4, 8]
distributions = [0.25, 0.5, 0.75, 1]

# Derive the rows from `epsilons` so the table cannot drift from the list above.
res = {e: [] for e in epsilons}
for e in epsilons:
    # Both filter columns are compared against the string form of the setting.
    budget_key = str([CONSTANT * 1.0, CONSTANT * 1.0 * e])
    for d in distributions:
        distribution_key = str({0: (1, 0), 1: (1 - d, d)})
        mask = (df['budgets_linear'] == budget_key) & (df['distribution'] == distribution_key)
        # tolist() yields builtin Python scalars. A NumPy scalar inside a tuple is rendered as
        # "np.float64(...)" by NumPy >= 2, which would never match the strings in 'limit'.
        avg_budgets = df.loc[mask, 'avg_budget'].tolist()
        if not avg_budgets:
            # No run for this setting: report NaN (like an empty mean) instead of an IndexError.
            res[e].append(np.nan)
            continue
        # Only the first matching run's average budget is used for the baseline lookup.
        limit_key = str((avg_budgets[0],))
        baseline_rows = df_sb[df_sb['limit'] == limit_key]
        res[e].append(round(np.mean(baseline_rows['f1_score']) * 100, 2))

print('Table of average student f1_score (in %) for non-personalized GNMax with average budget per personalization.\n')
pd.DataFrame(res, index=distributions).T

Table of average student f1_score (in %) for non-personalized GNMax with average budget per personalization.



Unnamed: 0,0.25,0.50,0.75,1.00
2,40.08,41.79,43.67,45.54
4,43.67,48.64,50.76,51.94
8,49.24,52.4,53.29,54.59
