In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
from itertools import product
import wandb
import numpy as np
import pandas as pd

In [3]:
# Weights & Biases entity and project name; both are passed to RunFactory,
# which uses them to look up already-logged runs and to tag generated commands.
wandb_user = 'sisaman'
wandb_project = 'GAP'

In [4]:
class RunFactory:
    """Collects de-duplicated ``python train.py ...`` command lines for sweeps.

    Each call to :meth:`register` expands a grid of options into individual
    configurations and turns every configuration that has not been seen before
    into one shell command. "Seen before" means: present in ``runs_df``, which
    starts out with the configs of the runs already logged to the W&B project
    (when ``check_existing`` is true) and grows with every registered config.

    Attributes:
        project: project name appended to every command as ``--project``.
        check_existing: whether known configurations are skipped.
        cmd_list: every command generated so far, in registration order.
        runs_df: one row per known configuration (columns are option names).
    """

    def __init__(self, entity, project, check_existing=True):
        """Create the factory and optionally load existing run configs from W&B.

        Args:
            entity: W&B entity (user or team) that owns the project.
            project: W&B project name; also passed to every command.
            check_existing: if true, query W&B for the configs of the runs in
                ``entity/project`` and skip matching configs in `register`.
        """
        self.project = project
        self.check_existing = check_existing
        self.cmd_list = []
        # Always defined, so register()/find_runs() also work when
        # check_existing is False (previously this raised AttributeError).
        self.runs_df = pd.DataFrame()

        if check_existing:
            api = wandb.Api()
            existing_projects = [p.name for p in api.projects(entity=entity)]
            # Compare against the constructor argument rather than a notebook
            # global, so the class works for any project name.
            if project in existing_projects:
                runs = api.runs(f"{entity}/{project}", per_page=2000)
                # Keys starting with '_' are dropped from each run's config.
                config_list = [
                    {k: v for k, v in run.config.items() if not k.startswith('_')}
                    for run in runs
                ]

                self.runs_df = pd.DataFrame.from_dict(config_list)
                if 'epsilon' in self.runs_df.columns:
                    self.runs_df['epsilon'] = self.runs_df['epsilon'].astype(float)

    def register(self, method: str, **params) -> list[str]:
        """Expand ``params`` into a grid and queue one command per new config.

        Args:
            method: positional argument placed right after ``train.py``.
            **params: option name -> value. A list or tuple is treated as the
                set of values to sweep; any other value is used as-is.

        Returns:
            The commands created by this call (also appended to ``cmd_list``).
        """
        # Build a fresh dict instead of rewriting `params` while iterating it.
        grid = {
            key: value if isinstance(value, (list, tuple)) else (value,)
            for key, value in params.items()
        }

        cmd_list = []
        for config in self.product_dict(grid):
            # NOTE(review): `method` is not part of the lookup key, so a config
            # whose options all match a known row of another method is treated
            # as a duplicate -- confirm this is intended.
            if self.check_existing and len(self.find_runs(config)) > 0:
                continue

            # Remember the config (before '--project' is added) so that later
            # register() calls do not emit it again.
            self.runs_df = pd.concat(
                [self.runs_df, pd.DataFrame(config, index=[0])], ignore_index=True
            )
            config['project'] = self.project
            options = ' '.join(f'--{param} {value}' for param, value in config.items())
            # split/join collapses any repeated whitespace in the command.
            command = ' '.join(f'python train.py {method} {options}'.split())
            cmd_list.append(command)

        self.cmd_list += cmd_list
        return cmd_list

    def get_all_runs(self) -> list[str]:
        """Return every command registered so far (the internal list itself)."""
        return self.cmd_list

    def save(self, path: str):
        """Write all registered commands to ``path``, one command per line.

        Missing parent directories are created. A bare file name (no directory
        component) is written to the current working directory.
        """
        directory = os.path.dirname(path)
        if directory:
            # os.makedirs('') raises, so only create a directory if one is named.
            os.makedirs(directory, exist_ok=True)
        with open(path, 'w') as file:
            for item in self.cmd_list:
                print(item, file=file)
        print(f'Saved {len(self.cmd_list)} commands to {path}')

    def find_runs(self, config: dict) -> pd.DataFrame:
        """Return the rows of ``runs_df`` whose columns equal every config value.

        An empty frame is returned when ``config`` has a key that is not a
        column of ``runs_df``. An empty ``config`` matches every row.
        """
        if set(config.keys()) - set(self.runs_df.columns):
            # if config has a key not in runs_df, return an empty df
            return pd.DataFrame()
        if not config:
            # No constraints: every known run matches. np.all over an empty
            # list would yield a scalar that .loc cannot use as a row mask.
            return self.runs_df.copy()
        # Row mask that is True only where all compared columns match.
        matches = np.all([self.runs_df[k] == v for k, v in config.items()], axis=0)
        return self.runs_df.loc[matches, :]

    @staticmethod
    def product_dict(params):
        """Yield one dict per element of the Cartesian product of the values.

        ``params`` maps each key to an iterable of candidate values; every
        yielded dict has the same keys, in the same order.
        """
        keys = params.keys()
        vals = params.values()
        for instance in product(*vals):
            yield dict(zip(keys, instance))


# Single factory shared by all experiment cells below. With check_existing=True
# the constructor queries W&B for the configs of runs already in the project.
run_factory = RunFactory(entity=wandb_user, project=wandb_project, check_existing=True)

# Experiments

### Hyper-parameters

In [5]:
# Datasets swept by every experiment, and the mini-batch size used for each
# one by the methods listed in `ndp_methods`.
datasets = ['facebook', 'reddit', 'amazon']
batch_size = {'facebook': 256, 'reddit': 2048, 'amazon': 4096}

# Method groups: first by model family, then by privacy setting.
gap_methods  = ['gap-inf', 'gap-edp', 'gap-ndp']
sage_methods = ['sage-inf', 'sage-edp', 'sage-ndp']
mlp_methods  = ['mlp', 'mlp-dp']
inf_methods  = ['gap-inf', 'sage-inf']
edp_methods  = ['gap-edp', 'sage-edp', 'mlp']
ndp_methods  = ['gap-ndp', 'sage-ndp', 'mlp-dp']
all_methods  = inf_methods + edp_methods + ndp_methods

# hparams[dataset][method] -> option name -> value (a list means "sweep these").
# Keys are inserted in a fixed order because that order becomes the order of
# the command-line options generated later.
hparams = {}
for dataset in datasets:
    hparams[dataset] = {method: {} for method in all_methods}

for dataset in datasets:
    per_method = hparams[dataset]

    # GAP family: architecture defaults plus the hop sweep.
    for method in gap_methods:
        per_method[method].update({
            'encoder_layers': 2,
            'base_layers': 1,
            'head_layers': 1,
            'combine': 'cat',
            'hops': [1, 2, 3, 4, 5],
        })

    # SAGE family: 'sage-ndp' is the only one that does not sweep mp_layers.
    for method in sage_methods:
        per_method[method].update({'base_layers': 2, 'head_layers': 1})
        if method != 'sage-ndp':
            per_method[method]['mp_layers'] = [1, 2, 3, 4, 5]

    # MLP family.
    for method in mlp_methods:
        per_method[method]['num_layers'] = 3

    # GAP-NDP and SAGE-NDP additionally sweep max_degree.
    for method in ['gap-ndp', 'sage-ndp']:
        per_method[method]['max_degree'] = [100, 200, 300, 400]

    # Options shared by every method, followed by the training regime, which
    # depends on whether the method is in ndp_methods.
    for method in all_methods:
        shared = {
            'hidden_dim': 16,
            'activation': 'selu',
            'optimizer': 'adam',
            'learning_rate': 0.01,
            'repeats': 1,  # FIXME: change this to 10 after sanity check
        }
        if method in ndp_methods:
            shared.update({
                'max_grad_norm': 1,
                'epochs': 10,
                'batch_size': batch_size[dataset],
            })
        else:
            shared.update({
                'batch_norm': True,
                'epochs': 100,
                'batch_size': 'full',
            })
        per_method[method].update(shared)

    # GAP methods reuse their epoch count for encoder_epochs.
    for method in gap_methods:
        per_method[method]['encoder_epochs'] = per_method[method]['epochs']


### Accuracy/Privacy Trade-off

In [6]:
# Register every (dataset, method) pair with its default hyper-parameters.
# Methods in ndp_methods or edp_methods also sweep a list of epsilon values;
# the remaining methods are registered without an epsilon option.
for dataset, method in product(datasets, all_methods):
    if method in ndp_methods:
        params = {'epsilon': [1, 2, 3, 4, 5, 10, 15, 20, 25]}
    elif method in edp_methods:
        params = {'epsilon': [0.2, 0.4, 0.6, 0.8, 1, 3, 5, 7, 9]}
    else:
        params = {}

    run_factory.register(method=method, dataset=dataset, **params, **hparams[dataset][method])

print(len(run_factory.get_all_runs()))

1002


### Effect of Encoder Module

In [7]:
# For the two private GAP variants, compare encoder_layers=0 against the
# default encoder depth over a per-method list of epsilon values.
for dataset, method in product(datasets, ['gap-edp', 'gap-ndp']):
    # Work on a copy so the shared hparams table is left untouched.
    hp = dict(hparams[dataset][method])
    default_encoder_layers = hp.pop('encoder_layers')

    if method == 'gap-edp':
        epsilon = [1, 3, 5, 7, 9]
    else:
        epsilon = [5, 10, 15, 20, 25]

    run_factory.register(
        method=method,
        dataset=dataset,
        encoder_layers=[0, default_encoder_layers],
        epsilon=epsilon,
        **hp,
    )

print(len(run_factory.get_all_runs()))

1377


### Effect of Hops

In [8]:
# Sweep `hops` for the two private GAP variants over a per-method epsilon list.
hops = [1, 2, 3, 4, 5]
for dataset, method in product(datasets, ['gap-edp', 'gap-ndp']):
    # Copy the defaults and drop 'hops' so it can be passed explicitly below.
    hp = dict(hparams[dataset][method])
    del hp['hops']

    epsilon = [5, 10, 20, 25] if method != 'gap-edp' else [1, 3, 7, 9]
    run_factory.register(method=method, dataset=dataset, hops=hops, epsilon=epsilon, **hp)

print(len(run_factory.get_all_runs()))

1377


### Effect of Degree

In [9]:
# Sweep `max_degree` for 'gap-ndp' on every dataset.
method = 'gap-ndp'
max_degree = [10, 20, 50, 100, 200, 300, 400]
epsilon = [5, 10, 20, 25]

for dataset in datasets:
    # Copy the defaults and drop 'max_degree' so the wider sweep replaces it.
    hp = dict(hparams[dataset][method])
    del hp['max_degree']
    run_factory.register(method=method, dataset=dataset, max_degree=max_degree, epsilon=epsilon, **hp)

print(len(run_factory.get_all_runs()))

1557


In [10]:
# Write every registered command, one per line, to the job script.
run_factory.save(path='jobs/experiments.sh')

Saved 1557 commands to jobs/experiments.sh
