# A100 Parameter Sweep

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns

In [None]:
def parse_file(filename):
    """Parse a single run's output log into summary metrics and a loss curve.

    The file is scanned line by line and lines are recognised by prefix:
    ``total time:``, ``warmup time:``, ``mean post time:``, ``total steps:``,
    ``test`` and ``loss``. If a prefix occurs more than once, the last
    occurrence wins. Metrics that never appear stay ``np.nan`` and the loss
    curve stays empty.

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    record : dict
        Keys ``'total time'``, ``'warmup time'``, ``'mean time'``,
        ``'total steps'`` and ``'score'``.
    loss : numpy.ndarray
        Values parsed from the ``loss`` line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a recognised line carries a non-numeric value.
    """
    def _seconds(field):
        # Timing fields presumably look like ' 12.3s' (TODO confirm); keep
        # what precedes the first 's' so a trailing unit suffix is dropped.
        return float(field.split('s')[0].strip())

    total_time = np.nan
    warmup_time = np.nan
    mean_time = np.nan
    total_steps = np.nan
    test_metric = np.nan
    loss = np.array([])
    with open(filename, 'r') as f:
        # Iterate the handle directly; readlines() would load the whole
        # file into a list first for no benefit.
        for line in f:
            if line.startswith('total time:'):
                total_time = _seconds(line.split(':')[1])
            elif line.startswith('warmup time:'):
                warmup_time = _seconds(line.split(':')[1])
            elif line.startswith('mean post time:'):
                mean_time = _seconds(line.split(':')[1])
            elif line.startswith('total steps:'):
                total_steps = int(line.split(':')[1].strip())
            elif line.startswith('test'):
                # Any line beginning with 'test' is treated as the score.
                test_metric = float(line.split(':')[1].strip())
            elif line.startswith('loss'):
                tokens = line.split(':')[1].split(',')
                # Skip blank tokens so a trailing comma or an empty loss
                # list does not abort the parse on float('').
                loss = np.array([float(tok) for tok in tokens if tok.strip()])
    record = {'total time': total_time, 'warmup time': warmup_time,
              'mean time': mean_time, 'total steps': total_steps,
              'score': test_metric}
    return record, loss

In [None]:
def parse_records(dataset, algorithms, max_degrees, batch_sizes, agg_dims, sample_sizes):
    """Load every run of a parameter sweep into two DataFrames.

    One log file is read per combination of the five parameter lists, from
    ``../output/<dataset>_a_<alg>_b_<batch>_ad_<aggdim>_m_<maxdeg>_s_<sample>.out.txt``.

    Returns
    -------
    ldf : pandas.DataFrame
        One row per (run, epoch) with the natural log of the loss and the
        run's parameters repeated on every row.
    rdf : pandas.DataFrame
        One row per run: the summary record from ``parse_file`` plus the
        run's parameters.
    """
    # Same iteration order as five nested loops, algorithm outermost.
    grid = ((alg, deg, batch, dim, samp)
            for alg in algorithms
            for deg in max_degrees
            for batch in batch_sizes
            for dim in agg_dims
            for samp in sample_sizes)

    summaries = []
    # Per-run column chunks; key order fixes the column order of ldf.
    chunks = {'epoch': [], 'loss': [], 'max degree': [], 'algorithm': [],
              'batch size': [], 'agg dim': [], 'sample size': []}

    for alg, deg, batch, dim, samp in grid:
        path = '../output/%s_a_%s_b_%d_ad_%d_m_%d_s_%d.out.txt' % (
            dataset, alg, batch, dim, deg, samp)
        summary, curve = parse_file(path)

        summary['algorithm'] = alg
        summary['max degree'] = deg
        summary['batch size'] = batch
        summary['agg dim'] = dim
        summary['sample size'] = samp
        summaries.append(summary)

        # Repeat the run's parameters once per recorded loss value so the
        # curve can be stored in long format.
        n_points = len(curve)
        chunks['epoch'].append(np.arange(n_points))
        chunks['loss'].append(curve)
        chunks['max degree'].append(np.repeat(deg, n_points))
        chunks['algorithm'].append(np.repeat(alg, n_points))
        chunks['batch size'].append(np.repeat(batch, n_points))
        chunks['agg dim'].append(np.repeat(dim, n_points))
        chunks['sample size'].append(np.repeat(samp, n_points))

    columns = {name: np.concatenate(parts) for name, parts in chunks.items()}
    # Losses are stored on a log scale.
    columns['loss'] = np.log(columns['loss'])

    ldf = pd.DataFrame(columns)
    rdf = pd.DataFrame(summaries)

    return ldf, rdf

In [None]:
def plot_loss_curves(ldf, title, hue='algorithm', style='max degree'):
    """Plot the 'loss' column of ldf against 'epoch' as a seaborn line plot.

    Lines are coloured by the ``hue`` column and given a line style per
    value of the ``style`` column. The y axis is labelled 'log loss'. The
    figure is shown immediately; nothing is returned.
    """
    inv_golden = (np.sqrt(5) - 1) / 2  # ~0.618, height-to-width ratio

    _, ax = plt.subplots(1, 1, figsize=(8, 8 * inv_golden))

    # One 'hls' colour per distinct value in the hue column.
    n_colors = len(ldf[hue].unique())
    colors = sns.color_palette('hls', n_colors)
    ax = sns.lineplot(data=ldf, x='epoch', y='loss',
                      hue=hue, style=style, palette=colors)

    ax.set_xlabel('epoch', fontsize=14)
    ax.set_ylabel('log loss', fontsize=14)
    ax.set_title(title, fontsize=16)
    ax.tick_params(labelsize=14)
    # Anchor the legend just past the right edge of the axes.
    ax.legend(bbox_to_anchor=(1.01, 1), fontsize=14)
    # Epochs are whole numbers, so keep the x ticks on integers.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.show()

In [None]:
def plot_epoch_bars(rdf, title, x='max degree', hue='algorithm'):
    """Draw a grouped bar chart of the 'mean time' column of rdf.

    Parameters
    ----------
    rdf : pandas.DataFrame
        Per-run records; must contain 'mean time' plus the ``x`` and
        ``hue`` columns.
    title : str
        Figure title.
    x : str
        Column used for the bar groups along the x axis.
    hue : str
        Column used to colour the bars within each group.

    The figure is shown immediately; nothing is returned.
    """
    gr = (np.sqrt(5)-1)/2  # inverse golden ratio (~0.618): height/width

    fig, ax = plt.subplots(1, 1, figsize=(8, gr*8))
    # Size the palette from the frame being plotted. The previous code read
    # a global `ldf` here, which fails (or silently uses stale data) unless
    # a matching `ldf` happens to exist in the notebook namespace.
    palette = sns.color_palette('hls', len(rdf[hue].unique()))
    ax = sns.barplot(x=x, y='mean time', hue=hue,
                     data=rdf, palette=palette, ax=ax)
    plt.xlabel(x, fontsize=14)
    plt.ylabel('time (s)', fontsize=14)
    plt.title(title, fontsize=16)
    ax.tick_params(labelsize=14)
    # Restyle the legend seaborn already created; the original author noted
    # that calling ax.legend() did not work here.
    legend = ax.get_legend()
    if legend is not None:
        plt.setp(legend.get_texts(), fontsize='14')
        plt.setp(legend.get_title(), fontsize='14')
    plt.show()

### BTER (Unsup) - Batch Size/Agg Dim
```
n=1
d='bter'
p='gpu'
a='dense'
bs=(128 512 1024)
e=5
ads=(256 512 1024)
de=2
m=100
nss=5
s=15
do=.5
```

In [None]:
# BTER sweep: batch size and agg dim vary; the other axes hold one value.
dataset = 'bter'
algorithms = ['dense']
max_degrees = [100]
sample_sizes = [15]
batch_sizes = [128, 512, 1024]
agg_dims = [256, 512, 1024]
ldf, rdf = parse_records(
    dataset=dataset,
    algorithms=algorithms,
    max_degrees=max_degrees,
    batch_sizes=batch_sizes,
    agg_dims=agg_dims,
    sample_sizes=sample_sizes,
)

In [None]:
# Loss curves: colour by batch size, line style by agg dim.
plot_loss_curves(
    ldf,
    title='BTER - Dense, Batch/Agg Sweep (A100)',
    hue='batch size',
    style='agg dim',
)

In [None]:
# Mean time bars: grouped by batch size, coloured by agg dim.
plot_epoch_bars(
    rdf,
    title='BTER - Dense, Batch/Agg Sweep (A100)',
    x='batch size',
    hue='agg dim',
)

### REDDIT (Unsup) - Batch Size/Agg Dim
```
n=1
d='lreddit'
p='gpu'
a='dense'
bs=(256 512 1024)
e=5
ads=(128 512 1024)
de=2
m=128
nss=5
s=15
do=.5
pt=.01
```

In [None]:
# REDDIT (unsup) sweep: batch size and agg dim vary; other axes hold one value.
dataset = 'lreddit'
algorithms = ['dense']
max_degrees = [128]
sample_sizes = [15]
batch_sizes = [256, 512, 1024]
agg_dims = [128, 512, 1024]
ldf, rdf = parse_records(
    dataset=dataset,
    algorithms=algorithms,
    max_degrees=max_degrees,
    batch_sizes=batch_sizes,
    agg_dims=agg_dims,
    sample_sizes=sample_sizes,
)

In [None]:
# Loss curves: colour by batch size, line style by agg dim.
plot_loss_curves(
    ldf,
    title='REDDIT (Unsup) - Dense, Batch/Agg Sweep (A100)',
    hue='batch size',
    style='agg dim',
)

In [None]:
# Mean time bars: grouped by batch size, coloured by agg dim.
plot_epoch_bars(
    rdf,
    title='REDDIT (Unsup) - Dense, Batch/Agg Sweep (A100)',
    x='batch size',
    hue='agg dim',
)

### OGB-ARXIV (Sup) - Batch Size/Agg Dim
```
n=1
d='arxiv'
p='gpu'
a='dense'
bs=(128 512 1024)
e=200
ads=(256 512 1024)
de=2
m=100
s=15
pa=20
do=.5
```

In [None]:
# OGB-ARXIV sweep: batch size and agg dim vary; other axes hold one value.
dataset = 'arxiv'
algorithms = ['dense']
max_degrees = [100]
sample_sizes = [15]
batch_sizes = [128, 512, 1024]
agg_dims = [256, 512, 1024]
ldf, rdf = parse_records(
    dataset=dataset,
    algorithms=algorithms,
    max_degrees=max_degrees,
    batch_sizes=batch_sizes,
    agg_dims=agg_dims,
    sample_sizes=sample_sizes,
)

In [None]:
# Loss curves: colour by batch size, line style by agg dim.
plot_loss_curves(
    ldf,
    title='ARXIV - Dense, Batch/Agg Sweep (A100)',
    hue='batch size',
    style='agg dim',
)

In [None]:
# Mean time bars: grouped by batch size, coloured by agg dim.
# Title corrected from 'ASRXIV' to 'ARXIV' to match the loss-curve plot.
plot_epoch_bars(rdf, 'ARXIV - Dense, Batch/Agg Sweep (A100)', x='batch size', hue='agg dim')

### REDDIT (Sup) - Batch Size/Agg Dim
```
n=1
d='nreddit'
p='gpu'
a='dense'
bs=(256 512 1024)
e=200
ads=(128 512 1024)
de=2
m=128
s=15
pa=20
do=.5
```

In [None]:
# REDDIT (sup) sweep: batch size and agg dim vary; other axes hold one value.
dataset = 'nreddit'
algorithms = ['dense']
max_degrees = [128]
sample_sizes = [15]
batch_sizes = [256, 512, 1024]
agg_dims = [128, 512, 1024]
ldf, rdf = parse_records(
    dataset=dataset,
    algorithms=algorithms,
    max_degrees=max_degrees,
    batch_sizes=batch_sizes,
    agg_dims=agg_dims,
    sample_sizes=sample_sizes,
)

In [None]:
# Loss curves: colour by batch size, line style by agg dim.
plot_loss_curves(
    ldf,
    title='REDDIT (Sup) - Dense, Batch/Agg Sweep (A100)',
    hue='batch size',
    style='agg dim',
)

In [None]:
# Mean time bars: grouped by batch size, coloured by agg dim.
plot_epoch_bars(
    rdf,
    title='REDDIT (Sup) - Dense, Batch/Agg Sweep (A100)',
    x='batch size',
    hue='agg dim',
)