In [40]:
import os
os.chdir('../')

In [41]:
%matplotlib inline
#%matplotlib notebook

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
from copy import deepcopy
from typing import List, Tuple

from cycler import cycler
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.decomposition import PCA
import scipy.stats as stats
import torch
from torch import nn
import torch.nn.functional as F
import seml

import tqdm
tqdm.tqdm.pandas()
#plt.style.use('ggplot')

  from pandas import Panel


In [43]:
#from notebooks import mpl_latex

In [44]:
#mpl_latex.enable_production_mode()

In [45]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [46]:
labels_to_plot = [
    'Vanilla GCN',
    'Vanilla GDC',
    'SVD GCN',
    'Jaccard GCN',
    'RGCN',
    'Soft Medoid GDC (T=0.5)',
    #'Soft Median GDC (T=1.0)',
    #'Soft Median GDC (T=0.5)',
    'Soft Median GDC (T=0.2)'
]

In [47]:
dataset_map = {
    'cora_ml': r'\textbf{Cora ML}',#'~\citep{Bojchevski2018}}', # '\rotatebox{90}{Cora ML~\citep{Bojchevski2018}}',
    'citeseer': r'\textbf{Citeseer}',#~\citep{McCallum2000}}',
    'pubmed': r'\textbf{PubMed}',#~\citep{Sen2008}}',
    'ogbn-arxiv': r'\textbf{arXiv}',#~\citep{Hu2020}}',
    'ogbn-products': r'\textbf{Products}',#~\citep{Hu2020}}',
    'ogbn-papers100M': r'\textbf{Papers 100M}',#~\citep{Hu2020}}'
}
dataset_order = [dataset_map[k] for k in dataset_map.keys()]

In [48]:
attack_map = {
    'DICE': r'\textbf{DICE}',
    'GANG': r'\textbf{GANG (ours)}',
    'FGSM': r'\textbf{greedy FGSM}',
    'GreedyRBCD': r'\textbf{GR-BCD (ours)}',
    'PGD': r'\textbf{PGD}',
    'PRBCD': r'\textbf{PR-BCD (ours)}'
}
attack_order = [attack_map[k] for k in attack_map.keys()]

In [49]:
pm = r'\(\pm\)'
bpm = r'\(\boldsymbol{\pm}\)'

In [50]:
def make_max_bold(group):
    for col in list(group.columns):
        if not group[col].isna().all():
            idx = np.where(np.max(group[col].fillna("")) == group[col].fillna(""))[0]
            group.loc[group.index[idx], col] = rf'\textbf{{{group.loc[group.index[idx], col].iloc[0]}}}'\
                .replace(pm, bpm)
    return group

In [51]:
def make_second_best_underlined(group):
    for col in list(group.columns):
        if not group[col].isna().all() and np.unique(group[col].fillna("")).shape[0] > 1:
            second_max = np.unique(group[col].fillna("").apply(
                lambda v: v.replace(r'\textbf{', '').replace('}', '')
            ))[-2]
            idx = np.where(second_max == group[col].fillna(""))[0]
            group.loc[group.index[idx], col] = rf'\underline{{{group.loc[group.index[idx], col].iloc[0]}}}'
    return group

In [52]:
def mark_best_and_second_best(group: pd.DataFrame, 
                              first_mark: str = r'\textbf',
                              second_mark: str = r'\underline',
                              dimension: int = 0) -> pd.DataFrame:
    iterable  

In [53]:
def calc_mean_and_error(values: pd.Series, seeds: pd.Series, with_error=True, decimal_places: int = 3): 
    values, seeds = values.values, seeds.values
    seeds = seeds[~np.isnan(values)]
    values = values[~np.isnan(values)]

    idx = np.unique(seeds, return_index=True)[1]
    values = values[idx]
    
    if with_error:
        return rf'{np.mean(values):.{decimal_places}f} $\pm$ {np.std(values)/len(values):.{decimal_places}f}'
    else:
        return rf'{np.mean(values):.{decimal_places}f}'

In [54]:
from functools import partial

def _mark_best_and_second_best(vector: np.ndarray, 
                               first_mark: str = r'\textbf',
                               second_mark: str = r'\underline',
                               is_higher_better : bool = True) -> np.ndarray:
    vector = vector.astype(object)
    values = np.unique(vector[vector == vector])
    if not len(values):
        return vector
    
    if first_mark:
        if is_higher_better:
            mask = [vector == values[-1]]
        else:
            mask = [vector == values[0]]
        vector[mask] = np.char.add(np.char.add(first_mark + '{', vector[mask]), '}')
    if second_mark:
        if is_higher_better:
            mask = [vector == values[-2]]
        else:
            mask = [vector == values[1]]
        vector[mask] = np.char.add(np.char.add(second_mark + '{', vector[mask]), '}')
    return vector
    

def mark_best_and_second_best(df: pd.DataFrame, 
                              first_mark: str = r'\textbf',
                              second_mark: str = r'\underline',
                              axis: int = 0,
                              is_higher_better : bool = True) -> pd.DataFrame:
    df[:] = np.apply_along_axis(
        partial(_mark_best_and_second_best, first_mark=first_mark, 
                second_mark=second_mark, is_higher_better=is_higher_better),
        axis=axis,
        arr=df.values
    )
    return df

In [152]:
df_experiments = seml.get_results('kdd21_rgnn_at_scale_attack_evasion_transfer',
                                  to_data_frame=True,
                                  fields=['batch_id', 'slurm', 'config', 'result'])
df_experiments.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=338.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=338.0), HTML(value='')))




  parsed = pd.io.json.json_normalize(parsed, sep='.')


Unnamed: 0,_id,batch_id,slurm.experiments_per_job,slurm.sbatch_options.gres,slurm.sbatch_options.mem,slurm.sbatch_options.cpus-per-task,slurm.sbatch_options.time,slurm.sbatch_options.partition,slurm.sbatch_options.nodes,slurm.sbatch_options.job-name,slurm.sbatch_options.array,slurm.sbatch_options.output,slurm.array_id,slurm.task_id,config.overwrite,config.db_collection,config.dataset,config.attack,config.attack_params.loss_type,config.epsilons,config.surrogate_params.n_filters,config.surrogate_params.dropout,config.surrogate_params.train_params.lr,config.surrogate_params.train_params.weight_decay,config.surrogate_params.train_params.patience,config.surrogate_params.train_params.max_epochs,config.binary_attr,config.seed,config.artifact_dir,config.pert_adj_storage_type,config.pert_attr_storage_type,config.model_storage_type,config.device,config.display_steps,config.model_label,result.results,config.attack_params.do_synchronize,config.attack_params.keep_heuristic,config.attack_params.search_space_size,config.attack_params.edge_budget,config.attack_params.edge_step_size,config.attack_params.edge_with_random_reverse,config.attack_params.eps,config.attack_params.feature_dedicated_iterations,config.attack_params.stop_optimizing_if_label_flipped,config.attack_params.feature_greedy_opt,config.attack_params.feature_lr,config.surrogate_params.do_checkpoint,config.surrogate_params.n_chunks,config.attack_params.lr_factor,config.attack_params.epsilon
0,1,1,1,gpu:1,16G,4,0-08:00,gpu_large,1,rgnn_at_scale_attack_evasion_transfer_1,0-17,/nfs/homedirs/geisler/code/robust_gnns_at_scal...,6209384,0,1,kdd21_rgnn_at_scale_attack_evasion_transfer,ogbn-arxiv,GreedyRBCD,CE,"[0, 0.01, 0.05, 0.1, 0.25]",64,0.5,0.01,0.001,100,3000,False,0,cache,evasion_transfer_attack_adj,evasion_transfer_attack_attr,pretrained,0,10,,"[{'label': 'Vanilla GCN', 'epsilon': 0, 'accur...",,,,,,,,,,,,,,,
1,2,1,1,gpu:1,16G,4,0-08:00,gpu_large,1,rgnn_at_scale_attack_evasion_transfer_1,0-17,/nfs/homedirs/geisler/code/robust_gnns_at_scal...,6209384,1,2,kdd21_rgnn_at_scale_attack_evasion_transfer,ogbn-arxiv,GreedyRBCD,MCE,"[0, 0.01, 0.05, 0.1, 0.25]",64,0.5,0.01,0.001,100,3000,False,0,cache,evasion_transfer_attack_adj,evasion_transfer_attack_attr,pretrained,0,10,,"[{'label': 'Vanilla GCN', 'epsilon': 0, 'accur...",,,,,,,,,,,,,,,
2,3,1,1,gpu:1,16G,4,0-08:00,gpu_large,1,rgnn_at_scale_attack_evasion_transfer_1,0-17,/nfs/homedirs/geisler/code/robust_gnns_at_scal...,6209384,2,3,kdd21_rgnn_at_scale_attack_evasion_transfer,ogbn-arxiv,GreedyRBCD,CE,"[0, 0.01, 0.05, 0.1, 0.25]",64,0.5,0.01,0.001,100,3000,False,1,cache,evasion_transfer_attack_adj,evasion_transfer_attack_attr,pretrained,0,10,,"[{'label': 'Vanilla GCN', 'epsilon': 0, 'accur...",,,,,,,,,,,,,,,
3,4,1,1,gpu:1,16G,4,0-08:00,gpu_large,1,rgnn_at_scale_attack_evasion_transfer_1,0-17,/nfs/homedirs/geisler/code/robust_gnns_at_scal...,6209384,3,4,kdd21_rgnn_at_scale_attack_evasion_transfer,ogbn-arxiv,GreedyRBCD,MCE,"[0, 0.01, 0.05, 0.1, 0.25]",64,0.5,0.01,0.001,100,3000,False,1,cache,evasion_transfer_attack_adj,evasion_transfer_attack_attr,pretrained,0,10,,"[{'label': 'Vanilla GCN', 'epsilon': 0, 'accur...",,,,,,,,,,,,,,,
4,5,1,1,gpu:1,16G,4,0-08:00,gpu_large,1,rgnn_at_scale_attack_evasion_transfer_1,0-17,/nfs/homedirs/geisler/code/robust_gnns_at_scal...,6209384,4,5,kdd21_rgnn_at_scale_attack_evasion_transfer,ogbn-arxiv,GreedyRBCD,CE,"[0, 0.01, 0.05, 0.1, 0.25]",64,0.5,0.01,0.001,100,3000,False,5,cache,evasion_transfer_attack_adj,evasion_transfer_attack_attr,pretrained,0,10,,"[{'label': 'Vanilla GCN', 'epsilon': 0, 'accur...",,,,,,,,,,,,,,,


In [153]:
df_experiments.columns

Index(['_id', 'batch_id', 'slurm.experiments_per_job',
       'slurm.sbatch_options.gres', 'slurm.sbatch_options.mem',
       'slurm.sbatch_options.cpus-per-task', 'slurm.sbatch_options.time',
       'slurm.sbatch_options.partition', 'slurm.sbatch_options.nodes',
       'slurm.sbatch_options.job-name', 'slurm.sbatch_options.array',
       'slurm.sbatch_options.output', 'slurm.array_id', 'slurm.task_id',
       'config.overwrite', 'config.db_collection', 'config.dataset',
       'config.attack', 'config.attack_params.loss_type', 'config.epsilons',
       'config.surrogate_params.n_filters', 'config.surrogate_params.dropout',
       'config.surrogate_params.train_params.lr',
       'config.surrogate_params.train_params.weight_decay',
       'config.surrogate_params.train_params.patience',
       'config.surrogate_params.train_params.max_epochs', 'config.binary_attr',
       'config.seed', 'config.artifact_dir', 'config.pert_adj_storage_type',
       'config.pert_attr_storage_type', 'conf

In [154]:
df_experiments.shape

(338, 51)

In [155]:
df_results = [
    pd.DataFrame(r) 
    for r in df_experiments['result.results']
]
for df_result, (_, df_experiment) in zip(df_results, df_experiments.iterrows()):
    df_result['dataset'] = df_experiment['config.dataset']
    df_result['attack'] = df_experiment['config.attack']
    df_result['seed'] = df_experiment['config.seed']
    df_result['batch_id'] = df_experiment['batch_id']
    df_result['novel_loss'] = (
        (df_experiment['config.attack_params.stop_optimizing_if_label_flipped'] == True)
        | (df_experiment['config.attack_params.loss_type'] == 'MCE')
        | (df_experiment['config.attack_params.loss_type'] == 'tanhCW')
    )

df_results = pd.concat(df_results, ignore_index=True)
df_results = df_results.sort_values('batch_id')
df_results = df_results.drop_duplicates([
    c for c in df_results.columns if c != 'batch_id' and c != 'accuracy'
], keep='last')

df_results

Unnamed: 0,label,epsilon,accuracy,dataset,attack,seed,batch_id,novel_loss
2307,Vanilla GCN,0.10,0.600748,ogbn-products,GANG,0,3,False
2305,Vanilla GDC,0.05,0.699293,ogbn-products,GANG,0,3,False
2301,Vanilla GDC,0.01,0.705477,ogbn-products,GANG,0,3,False
2309,Vanilla GDC,0.10,0.694208,ogbn-products,GANG,0,3,False
2299,Vanilla GCN,0.01,0.712638,ogbn-products,GANG,0,3,False
...,...,...,...,...,...,...,...,...
11820,Vanilla GCN,0.00,0.827273,cora_ml,PGD,5,13,False
11821,Vanilla GCN,0.01,0.818182,cora_ml,PGD,5,13,False
11822,Vanilla GCN,0.05,0.791700,cora_ml,PGD,5,13,False
11824,Vanilla GCN,0.25,0.631621,cora_ml,PGD,5,13,False


In [181]:
df_results[(df_results['attack'] == 'PRBCD') 
           & (df_results['epsilon'] == 0.05) 
           & (df_results['label'] == 'Vanilla GCN')]

Unnamed: 0,label,epsilon,accuracy,dataset,attack,seed,batch_id,loss,novel_loss
2267,Vanilla GCN,0.05,0.637545,ogbn-products,PRBCD,1,3,CE,False
2235,Vanilla GCN,0.05,0.638041,ogbn-products,PRBCD,0,3,CE,False
5601,Vanilla GCN,0.05,0.756917,cora_ml,PRBCD,5,6,CE,False
5656,Vanilla GCN,0.05,0.705929,cora_ml,PRBCD,5,6,tanh Margin,True
5491,Vanilla GCN,0.05,0.768775,cora_ml,PRBCD,1,6,CE,False
5546,Vanilla GCN,0.05,0.727668,cora_ml,PRBCD,1,6,tanh Margin,True
5821,Vanilla GCN,0.05,0.671658,citeseer,PRBCD,1,6,CE,False
5711,Vanilla GCN,0.05,0.672193,citeseer,PRBCD,0,6,CE,False
5766,Vanilla GCN,0.05,0.619786,citeseer,PRBCD,0,6,tanh Margin,True
5381,Vanilla GCN,0.05,0.731621,cora_ml,PRBCD,0,6,CE,False


In [156]:
df_results.attack.unique()

array(['GANG', 'PRBCD', 'DICE', 'GreedyRBCD', 'PGD', 'FGSM'], dtype=object)

In [157]:
df_results.dataset.unique()

array(['ogbn-products', 'cora_ml', 'citeseer', 'ogbn-arxiv', 'pubmed'],
      dtype=object)

In [158]:
df_results[df_results.dataset == 'ogbn-arxiv'].label.unique()

array(['Vanilla GCN', 'Soft Median GDC (T=0.5)',
       'Soft Median GDC (T=5.0)', 'Soft Median GDC (T=50.0)',
       'Soft Medoid GDC (T=50.0)', 'Soft Medoid GDC (T=0.5)',
       'Vanilla GDC', 'Soft Medoid GDC (T=5.0)',
       'Soft Median GDC (T=1.0)', 'Soft Median GDC (T=0.2)'], dtype=object)

In [159]:
architecture_c = r'\textbf{Architecture}'
dataset_c = r'   '
attack_c = r'\textbf{Attack}'
epsilons_c = r'Frac. edges \(\boldsymbol{\epsilon}\)'

epsilons = [0.01, 0.05, 0.1, 0.25]
epsilon_marks = ['', r'\textit', r'\underline', r'\textbf']

In [160]:
def transform_label(label: str):
    return label.replace('\n', ' ').replace(' (T=0.5)','').replace(' (T=0.2)','')

In [180]:
df = []

for (dataset, attack, label, epsilon), df_group in df_results[
    (df_results['novel_loss']
     | (df_results['attack'] == 'DICE')
     | (df_results['attack'] == 'GANG'))
    & df_results['label'].isin(labels_to_plot)
].groupby(['dataset', 'attack', 'label', 'epsilon']):
    if len(df_group.seed.unique()) != 3:
        print(f'For {dataset}-{attack}-{epsilon}-{label} collected runs for seed {df_group.seed.tolist()}')

    accurcy = calc_mean_and_error(df_group.accuracy, df_group.seed, with_error=False)
    
    df.append({
        dataset_c: dataset_map[dataset],
        architecture_c: transform_label(label),
        attack_c: attack_map[attack],
        epsilons_c: epsilon,
        'accuracy': accurcy
    })
    
df = pd.DataFrame(df)

df = df[
    ~((df[attack_c] == attack_map['PGD']) & (df[dataset_c] == dataset_map['pubmed']))
    & ~((df[attack_c] == attack_map['FGSM']) & (df[dataset_c] == dataset_map['pubmed']))
    #& ~((df[architecture_c] == transform_label('Soft Medoid GDC (T=0.5)')) & (df[dataset_c] == dataset_map['pubmed']))
]

df

For ogbn-arxiv-DICE-0.0-Soft Median GDC (T=0.2) collected runs for seed [1, 5]
For ogbn-arxiv-DICE-0.01-Soft Median GDC (T=0.2) collected runs for seed [1, 5]
For ogbn-arxiv-DICE-0.05-Soft Median GDC (T=0.2) collected runs for seed [1, 5]
For ogbn-arxiv-DICE-0.1-Soft Median GDC (T=0.2) collected runs for seed [1, 5]
For ogbn-arxiv-DICE-0.25-Soft Median GDC (T=0.2) collected runs for seed [1, 5]
For ogbn-arxiv-PRBCD-0.0-Soft Median GDC (T=0.2) collected runs for seed [5, 0]
For ogbn-arxiv-PRBCD-0.01-Soft Median GDC (T=0.2) collected runs for seed [5, 0]
For ogbn-arxiv-PRBCD-0.05-Soft Median GDC (T=0.2) collected runs for seed [5, 0]
For ogbn-arxiv-PRBCD-0.1-Soft Median GDC (T=0.2) collected runs for seed [5, 0]
For ogbn-arxiv-PRBCD-0.25-Soft Median GDC (T=0.2) collected runs for seed [5, 0]
For ogbn-products-PRBCD-0.0-Soft Median GDC (T=0.2) collected runs for seed [0]
For ogbn-products-PRBCD-0.01-Soft Median GDC (T=0.2) collected runs for seed [0]
For ogbn-products-PRBCD-0.05-Soft Medi

Unnamed: 0,Unnamed: 1,\textbf{Architecture},\rotatebox{90}{\textbf{Attack}},\makecell{\textbf{Frac.}\\\textbf{edges}\\\(\boldsymbol{\epsilon}\)},accuracy
0,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{DICE},0.00,0.714
1,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{DICE},0.01,0.712
2,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{DICE},0.05,0.707
3,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{DICE},0.10,0.699
4,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{DICE},0.25,0.676
...,...,...,...,...,...
620,\textbf{PubMed},\makecell{Vanilla\\GDC},\textbf{PR-BCD (ours)},0.00,0.784
621,\textbf{PubMed},\makecell{Vanilla\\GDC},\textbf{PR-BCD (ours)},0.01,0.756
622,\textbf{PubMed},\makecell{Vanilla\\GDC},\textbf{PR-BCD (ours)},0.05,0.686
623,\textbf{PubMed},\makecell{Vanilla\\GDC},\textbf{PR-BCD (ours)},0.10,0.633


In [162]:
df_accuracy = df[(df[epsilons_c] == 0) & (df[attack_c] == attack_map['PRBCD'])]
df_accuracy = df_accuracy.drop(columns=[attack_c, epsilons_c])
df_accuracy[dataset_c] = df_accuracy[dataset_c].apply(lambda dataset: rf'\rotatebox{{90}}{{{dataset}}}')
df_accuracy = df_accuracy.set_index([dataset_c, architecture_c])
df_accuracy.columns = pd.MultiIndex.from_product([['\textbf{Accuracy}'], [' ']])

df_accuracy

Unnamed: 0_level_0,Unnamed: 1_level_0,\textbf{Accuracy}
Unnamed: 0_level_1,\textbf{Architecture},Unnamed: 2_level_1
\rotatebox{90}{\textbf{Citeseer}},Jaccard GCN,0.714
\rotatebox{90}{\textbf{Citeseer}},RGCN,0.646
\rotatebox{90}{\textbf{Citeseer}},SVD GCN,0.641
\rotatebox{90}{\textbf{Citeseer}},Soft Median GDC,0.709
\rotatebox{90}{\textbf{Citeseer}},Soft Medoid GDC,0.707
\rotatebox{90}{\textbf{Citeseer}},Vanilla GCN,0.712
\rotatebox{90}{\textbf{Citeseer}},Vanilla GDC,0.709
\rotatebox{90}{\textbf{Cora ML}},Jaccard GCN,0.819
\rotatebox{90}{\textbf{Cora ML}},RGCN,0.8
\rotatebox{90}{\textbf{Cora ML}},SVD GCN,0.761


In [163]:
df[architecture_c] = df[architecture_c].astype("category")
df[architecture_c].cat.set_categories(
    [transform_label(l) for l in labels_to_plot],
    inplace=True
)
df[dataset_c] = df[dataset_c].astype("category")
df[dataset_c].cat.set_categories(
    dataset_order,
    inplace=True
)
df[dataset_c] = df[dataset_c].cat.rename_categories([rf'\rotatebox{{90}}{{{cat}}}' for cat in df[dataset_c].cat.categories])


df = df[df[epsilons_c].isin(epsilons)].copy()

df_piv = pd.pivot_table(
    df, 
    index=[dataset_c, architecture_c], 
    columns=[attack_c, epsilons_c], #['type', attack_c, ' '], 
    values='accuracy',
    aggfunc=lambda x: ' '.join(x)
)

df_piv = df_piv[[c for attack in attack_order for c in df_piv.columns if c[0] == attack]]

for epsilon, mark in zip(epsilons, epsilon_marks):
    current_columns = [c for c in df_piv.columns if c[1] == epsilon]
    df_piv[current_columns] = df_piv[current_columns].groupby(dataset_c).apply(
        partial(mark_best_and_second_best, first_mark=mark, second_mark='', axis=1, is_higher_better=False)
    )

def acc_to_mean_and_error(group: pd.DataFrame, with_error=True, decimal_places: int = 3):
    return calc_mean_and_error(group['accuracy'], group['seed'], with_error=with_error, decimal_places=decimal_places)

df_piv = df_piv.join(df_accuracy)
df_piv = df_piv.fillna('-')
df_piv

  vector[mask] = np.char.add(np.char.add(first_mark + '{', vector[mask]), '}')


Unnamed: 0_level_0,\textbf{Attack},\textbf{DICE},\textbf{DICE},\textbf{DICE},\textbf{DICE},\textbf{GANG (ours)},\textbf{GANG (ours)},\textbf{GANG (ours)},\textbf{GANG (ours)},\textbf{greedy FGSM},\textbf{greedy FGSM},\textbf{greedy FGSM},\textbf{greedy FGSM},\textbf{GR-BCD (ours)},\textbf{GR-BCD (ours)},\textbf{GR-BCD (ours)},\textbf{GR-BCD (ours)},\textbf{PGD},\textbf{PGD},\textbf{PGD},\textbf{PGD},\textbf{PR-BCD (ours)},\textbf{PR-BCD (ours)},\textbf{PR-BCD (ours)},\textbf{PR-BCD (ours)},\textbf{Accuracy}
Unnamed: 0_level_1,Frac. edges \(\boldsymbol{\epsilon}\),0.01,0.05,0.1,0.25,0.01,0.05,0.1,0.25,0.01,0.05,0.1,0.25,0.01,0.05,0.1,0.25,0.01,0.05,0.1,0.25,0.01,0.05,0.1,0.25,Unnamed: 26_level_1
Unnamed: 0_level_2,\textbf{Architecture},Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
\rotatebox{90}{\textbf{Cora ML}},Vanilla GCN,0.822,0.813,0.803,0.765,0.809,0.766,0.732,0.658,0.792,0.701,0.634,0.513,0.790,\textit{0.699},\underline{0.627},0.506,0.801,0.748,0.706,0.634,0.79,0.711,0.641,\textbf{0.498},0.825
\rotatebox{90}{\textbf{Cora ML}},Vanilla GDC,0.829,0.820,0.807,0.774,0.822,0.788,0.762,0.712,0.795,\textit{0.701},\underline{0.635},\textbf{0.516},0.798,0.709,0.640,0.542,0.808,0.759,0.727,0.682,0.794,0.717,0.649,0.526,0.831
\rotatebox{90}{\textbf{Cora ML}},SVD GCN,0.758,0.754,0.741,0.696,0.770,0.764,0.760,0.722,0.757,0.739,0.711,0.619,0.757,0.743,0.722,0.633,0.757,0.737,0.696,0.588,0.757,\textit{0.733},\underline{0.691},\textbf{0.570},0.761
\rotatebox{90}{\textbf{Cora ML}},Jaccard GCN,0.817,0.810,0.801,0.769,0.808,0.788,0.768,0.737,0.791,\textit{0.707},\underline{0.643},\textbf{0.523},0.789,0.716,0.655,0.557,0.800,0.757,0.721,0.666,0.79,0.724,0.660,0.532,0.819
\rotatebox{90}{\textbf{Cora ML}},RGCN,0.799,0.794,0.785,0.756,0.732,0.701,0.674,0.603,0.776,\textit{0.699},\underline{0.637},\textbf{0.517},0.774,0.706,0.643,0.529,0.780,0.739,0.709,0.638,0.777,0.712,0.658,0.528,0.8
\rotatebox{90}{\textbf{Cora ML}},Soft Medoid GDC,0.816,0.813,0.806,0.793,0.772,\textit{0.769},0.765,0.755,0.803,0.771,\underline{0.742},\textbf{0.684},0.806,0.788,0.775,0.755,0.813,0.800,0.794,0.780,0.806,0.780,0.758,0.725,0.817
\rotatebox{90}{\textbf{Cora ML}},Soft Median GDC,0.819,0.814,0.811,0.797,0.791,0.789,0.785,0.773,0.803,\textit{0.764},\underline{0.732},\textbf{0.674},0.808,0.782,0.767,0.742,0.813,0.801,0.791,0.776,0.805,0.776,0.750,0.711,0.819
\rotatebox{90}{\textbf{Citeseer}},Vanilla GCN,0.710,0.702,0.691,0.663,0.700,0.675,0.644,0.593,0.689,0.608,0.532,0.390,0.682,\textit{0.602},\underline{0.528},\textbf{0.368},0.696,0.649,0.610,0.538,0.685,0.608,0.544,0.410,0.712
\rotatebox{90}{\textbf{Citeseer}},Vanilla GDC,0.706,0.694,0.682,0.649,0.701,0.686,0.662,0.630,0.686,0.609,0.542,0.412,0.681,\textit{0.602},\underline{0.537},0.407,0.690,0.641,0.604,0.531,0.679,0.611,0.539,\textbf{0.405},0.709
\rotatebox{90}{\textbf{Citeseer}},SVD GCN,0.637,0.625,0.606,0.566,0.639,0.627,\underline{0.420},0.539,0.638,0.617,0.585,0.497,0.639,0.624,0.593,0.484,0.638,0.613,0.576,0.487,0.635,\textit{0.608},0.562,\textbf{0.464},0.641


In [164]:
print(df_piv\
    .sort_index(level=1, ascending=False, sort_remaining=False)\
    .sort_index(level=0, ascending=True, sort_remaining=True)\
    .to_latex(
        float_format='%.2f',
        caption=r'Perturbed accuracy for the proposed attacks (see Sections~\ref{sec:attackkdd}-\ref{sec:prbcd}) and baselines on all datasets (see Table~\ref{tab:datasets}). \(\epsilon\) denotes the fraction of edges perturbed (relative to the clean graph). The last column contains the clean accuracy. As this a work-in-progress report, the experiments for the defenses on the large datasets are due and on Products we did not optimize the hyperparameters for GANG. For each architecture we italicize the strongest attack where \(\epsilon=0.05\), underline where \(\epsilon=0.1\), and embolden where \(\epsilon=0.25\). From an attack perspective, a lower perturbed accuracy is better. We rerun the experiments with three different seeds. For OGB we use the provided data splits and otherwise we use random split with 20 nodes per class.', 
        label='tab:global',
        escape=False,
        multirow=True,
        multicolumn=True,
        column_format='llccccccccccccccccccccccccc'
    )
)

\begin{table}
\centering
\caption{Perturbed accuracy for the proposed attacks (see Sections~\ref{sec:attackkdd}-\ref{sec:prbcd}) and baselines on all datasets (see Table~\ref{tab:datasets}). \(\epsilon\) denotes the fraction of edges perturbed (relative to the clean graph). The last column contains the clean accuracy. As this a work-in-progress report, the experiments for the defenses on the large datasets are due and on Products we did not optimize the hyperparameters for GANG. For each architecture we italicize the strongest attack where \(\epsilon=0.05\), underline where \(\epsilon=0.1\), and embolden where \(\epsilon=0.25\). From an attack perspective, a lower perturbed accuracy is better. We rerun the experiments with three different seeds. For OGB we use the provided data splits and otherwise we use random split with 20 nodes per class.}
\label{tab:global}
\begin{tabular}{llccccccccccccccccccccccccc}
\toprule
                                  & \textbf{Attack} & \multicolumn{4}{l

# Compare losses

In [165]:
architecture_c = r'\textbf{Architecture}'
dataset_c = r'   '
attack_c = r'\rotatebox{90}{\textbf{Attack}}'
epsilons_c = r'\makecell{\textbf{Frac.}\\\textbf{edges}\\\(\boldsymbol{\epsilon}\)}'

epsilons = [0.01, 0.05, 0.1, 0.25]
epsilon_marks = ['', r'\textit', r'\underline', r'\textbf']

In [166]:
loss_c = r'\textbf{Loss}'

loss_map = {
    'CE': 'CE',
    'CW': 'CW',
    'SCE': 'SCE',
    'MCE': 'MCE',
    'tanhCW': 'tanh Margin',
}

def loss_str(attack: str, loss: str, stop_optimizing_if_label_flipped: bool) -> str:
    if isinstance(loss, str):
        return loss_map[loss]
    else:
        if attack == 'FGSM' and stop_optimizing_if_label_flipped:
            return loss_map['MCE']
        elif attack == 'PGD' and stop_optimizing_if_label_flipped:
            return loss_map['tanhCW']
        else:
            return loss_map['CE']
    
def transform_label(label: str):
    label = label.replace('\n', ' ').replace(' (T=0.5)','').replace(' (T=0.2)','').replace(' ', r'\\')
    return f'\makecell{{{label}}}'

In [167]:
df_results = [
    pd.DataFrame(r) 
    for r in df_experiments['result.results']
]
for df_result, (_, df_experiment) in zip(df_results, df_experiments.iterrows()):
    df_result['dataset'] = df_experiment['config.dataset']
    df_result['attack'] = df_experiment['config.attack']
    df_result['seed'] = df_experiment['config.seed']
    df_result['batch_id'] = df_experiment['batch_id']
    df_result['loss'] = loss_str(
        df_experiment['config.attack'],
        df_experiment['config.attack_params.loss_type'],
        df_experiment['config.attack_params.stop_optimizing_if_label_flipped']
    )
    df_result['novel_loss'] = (
        (df_experiment['config.attack_params.stop_optimizing_if_label_flipped'] == True)
        | (df_experiment['config.attack_params.loss_type'] == 'MCE')
        | (df_experiment['config.attack_params.loss_type'] == 'tanhCW')
    )

df_results = pd.concat(df_results, ignore_index=True)
df_results = df_results.sort_values('batch_id')
df_results = df_results.drop_duplicates([
    c for c in df_results.columns if c != 'batch_id' and c != 'accuracy'
], keep='last')

df_results

Unnamed: 0,label,epsilon,accuracy,dataset,attack,seed,batch_id,loss,novel_loss
2307,Vanilla GCN,0.10,0.600748,ogbn-products,GANG,0,3,CE,False
2305,Vanilla GDC,0.05,0.699293,ogbn-products,GANG,0,3,CE,False
2301,Vanilla GDC,0.01,0.705477,ogbn-products,GANG,0,3,CE,False
2309,Vanilla GDC,0.10,0.694208,ogbn-products,GANG,0,3,CE,False
2299,Vanilla GCN,0.01,0.712638,ogbn-products,GANG,0,3,CE,False
...,...,...,...,...,...,...,...,...,...
11820,Vanilla GCN,0.00,0.827273,cora_ml,PGD,5,13,SCE,False
11821,Vanilla GCN,0.01,0.818182,cora_ml,PGD,5,13,SCE,False
11822,Vanilla GCN,0.05,0.791700,cora_ml,PGD,5,13,SCE,False
11824,Vanilla GCN,0.25,0.631621,cora_ml,PGD,5,13,SCE,False


In [168]:
df = []

for (dataset, attack, label, epsilon, loss), df_group in df_results[
    df_results['attack'].isin(['FGSM', 'PGD'])
    & df_results['label'].isin(labels_to_plot)
].groupby(['dataset', 'attack', 'label', 'epsilon', 'loss']):
    if len(df_group.seed.unique()) != 3:
        print(f'For {dataset}-{attack}-{epsilon} collected runs for seed {df_group.seed.tolist()}')

    accuracy = calc_mean_and_error(df_group.accuracy, df_group.seed, with_error=False, decimal_places=4)
    df.append({
        dataset_c: dataset_map[dataset],
        architecture_c: transform_label(label),
        attack_c: attack_map[attack],
        epsilons_c: epsilon,
        'accuracy': accuracy,
        loss_c: loss
    })
    
df = pd.DataFrame(df)

df = df[
    df[attack_c].isin([attack_map['PGD'], attack_map['FGSM']])
    & ~((df[attack_c] == attack_map['PGD']) & (df[dataset_c] == dataset_map['pubmed']))
    & ~((df[attack_c] == attack_map['FGSM']) & (df[dataset_c] == dataset_map['pubmed']))
]

df = df[df[epsilons_c].isin(epsilons)].copy()

df

Unnamed: 0,Unnamed: 1,\textbf{Architecture},\rotatebox{90}{\textbf{Attack}},\makecell{\textbf{Frac.}\\\textbf{edges}\\\(\boldsymbol{\epsilon}\)},accuracy,\textbf{Loss}
5,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{greedy FGSM},0.01,0.7091,CE
6,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{greedy FGSM},0.01,0.7018,CW
7,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{greedy FGSM},0.01,0.6959,MCE
8,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{greedy FGSM},0.01,0.7046,SCE
9,\textbf{Citeseer},\makecell{Jaccard\\GCN},\textbf{greedy FGSM},0.01,0.6970,tanh Margin
...,...,...,...,...,...,...
695,\textbf{Cora ML},\makecell{Vanilla\\GDC},\textbf{PGD},0.25,0.5953,CE
696,\textbf{Cora ML},\makecell{Vanilla\\GDC},\textbf{PGD},0.25,0.7198,CW
697,\textbf{Cora ML},\makecell{Vanilla\\GDC},\textbf{PGD},0.25,0.6822,MCE
698,\textbf{Cora ML},\makecell{Vanilla\\GDC},\textbf{PGD},0.25,0.6590,SCE


In [169]:
df[architecture_c] = df[architecture_c].astype("category")
df[architecture_c].cat.set_categories(
    [transform_label(l) for l in labels_to_plot],
    inplace=True
)
df[dataset_c] = df[dataset_c].astype("category")
df[dataset_c].cat.set_categories(
    dataset_order,
    inplace=True
)
df[attack_c] = df[attack_c].astype("category")
df[attack_c].cat.set_categories(
    list(attack_map.values()),
    inplace=True
)
df[attack_c] = df[attack_c].cat.rename_categories([rf'\rotatebox{{90}}{{{cat}}}' for cat in df[attack_c].cat.categories])

df_piv = pd.pivot_table(
    df, 
    index=[attack_c, epsilons_c, loss_c], 
    columns=[dataset_c, architecture_c],
    values='accuracy',
    aggfunc=lambda x: ' '.join(x)
)

#df_piv = df_piv[[c for attack in attack_order for c in df_piv.columns if c[0] == attack]]

#for epsilon, mark in zip(epsilons, epsilon_marks):
#    current_columns = [c for c in df_piv.columns if c[1] == epsilon]
#    df_piv[current_columns] = df_piv[current_columns].groupby(dataset_c).apply(
#        partial(mark_best_and_second_best, first_mark=mark, second_mark='', axis=1, is_higher_better=False)
#    )

#def acc_to_mean_and_error(group: pd.DataFrame, with_error=True, decimal_places: int = 3):
#    return calc_mean_and_error(group['accuracy'], group['seed'], with_error=with_error, decimal_places=decimal_places)

#df_piv = df_piv.join(df_accuracy)
df_piv = df_piv.fillna('-')

ce_acc = df_piv.xs(loss_map['CE'], level=2, drop_level=False).values
nonce_acc = np.vstack([
    df_piv.xs(loss_map[loss], level=2, drop_level=False).values
    for loss in loss_map.keys() if loss != loss_map['CE']
])

row_idx = len(loss_map) * np.arange(df_piv.shape[0] // len(loss_map))
row_idx = (row_idx[:, None] 
           + df_piv.groupby(level=[0,1]).aggregate(lambda col: col.astype(float).argmin()).values
          ).flatten()

col_idx = np.tile(np.arange(df_piv.shape[1]), df_piv.shape[0] // len(loss_map))

mask = np.zeros(df_piv.shape).astype(bool)
mask[row_idx, col_idx] = True

df_piv = df_piv.mask(
    pd.DataFrame(mask, columns=df_piv.columns, index=df_piv.index),
    df_piv.applymap(lambda elem: rf'{epsilon_marks[-1]}{{{elem}}}')
)
df_piv

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,\textbf{Cora ML},\textbf{Cora ML},\textbf{Cora ML},\textbf{Cora ML},\textbf{Cora ML},\textbf{Cora ML},\textbf{Cora ML},\textbf{Citeseer},\textbf{Citeseer},\textbf{Citeseer},\textbf{Citeseer},\textbf{Citeseer},\textbf{Citeseer},\textbf{Citeseer}
Unnamed: 0_level_1,Unnamed: 1_level_1,\textbf{Architecture},\makecell{Vanilla\\GCN},\makecell{Vanilla\\GDC},\makecell{SVD\\GCN},\makecell{Jaccard\\GCN},\makecell{RGCN},\makecell{Soft\\Medoid\\GDC},\makecell{Soft\\Median\\GDC},\makecell{Vanilla\\GCN},\makecell{Vanilla\\GDC},\makecell{SVD\\GCN},\makecell{Jaccard\\GCN},\makecell{RGCN},\makecell{Soft\\Medoid\\GDC},\makecell{Soft\\Median\\GDC}
\rotatebox{90}{\textbf{Attack}},\makecell{\textbf{Frac.}\\\textbf{edges}\\\(\boldsymbol{\epsilon}\)},\textbf{Loss},Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
\rotatebox{90}{\textbf{greedy FGSM}},0.01,CE,0.8087,0.8144,0.7576,0.8066,0.7864,0.8061,0.8092,0.7052,0.7000,0.6401,0.7091,0.6389,0.7045,0.7061
\rotatebox{90}{\textbf{greedy FGSM}},0.01,CW,0.8079,0.8145,0.7573,0.8041,0.7843,0.8149,0.8167,0.6966,0.6916,0.6394,0.7018,0.6312,0.7070,0.7077
\rotatebox{90}{\textbf{greedy FGSM}},0.01,MCE,\textbf{0.7859},\textbf{0.7953},0.7573,\textbf{0.7871},\textbf{0.7730},0.8070,0.8078,\textbf{0.6850},\textbf{0.6832},\textbf{0.6376},\textbf{0.6959},\textbf{0.6305},0.7037,0.7046
\rotatebox{90}{\textbf{greedy FGSM}},0.01,SCE,0.8124,0.8204,0.7580,0.8084,0.7872,0.8157,0.8170,0.7009,0.6975,0.6387,0.7046,0.6328,0.7075,0.7089
\rotatebox{90}{\textbf{greedy FGSM}},0.01,tanh Margin,0.7920,0.7953,\textbf{0.7567},0.7905,0.7756,\textbf{0.8033},\textbf{0.8025},0.6895,0.6856,0.6383,0.6970,0.6335,\textbf{0.7029},\textbf{0.7021}
\rotatebox{90}{\textbf{greedy FGSM}},0.05,CE,0.7577,0.7586,0.7414,0.7605,0.7428,0.7722,0.7722,0.6693,0.6594,0.6244,0.6799,0.6077,\textbf{0.6852},0.6831
\rotatebox{90}{\textbf{greedy FGSM}},0.05,CW,0.7378,0.7531,0.7445,0.7465,0.7318,0.8054,0.8016,0.6332,0.6385,0.6225,0.6560,0.5818,0.7029,0.7039
\rotatebox{90}{\textbf{greedy FGSM}},0.05,MCE,\textbf{0.6908},0.7045,0.7426,0.7116,0.7004,0.7885,0.7850,\textbf{0.6064},\textbf{0.6036},0.6212,0.6410,\textbf{0.5815},0.6952,0.6911
\rotatebox{90}{\textbf{greedy FGSM}},0.05,SCE,0.7623,0.7750,0.7444,0.7647,0.7466,0.8083,0.8062,0.6519,0.6565,0.6205,0.6693,0.5879,0.7061,0.7059
\rotatebox{90}{\textbf{greedy FGSM}},0.05,tanh Margin,0.7011,\textbf{0.7007},\textbf{0.7389},\textbf{0.7071},\textbf{0.6993},\textbf{0.7713},\textbf{0.7639},0.6082,0.6094,\textbf{0.6168},\textbf{0.6348},0.5927,0.6889,\textbf{0.6802}


In [170]:
print(df_piv\
    .sort_index(level=1, ascending=False, sort_remaining=False)\
    .sort_index(level=0, ascending=True, sort_remaining=True)\
    .to_latex(
        float_format='%.2f',
        caption=r'Perturbed accuracy comparing the conventional losses with our loss. \(\epsilon\) denotes the fraction of edges perturbed (relative to the clean graph). We use random split with 20 nodes per class.', 
        label='tab:losscompare',
        escape=False,
        multirow=True,
        multicolumn=True,
        column_format='lcl|ccccccc|ccccccc'
    )
)

\begin{table}
\centering
\caption{Perturbed accuracy comparing the conventional losses with our loss. \(\epsilon\) denotes the fraction of edges perturbed (relative to the clean graph). We use random split with 20 nodes per class.}
\label{tab:losscompare}
\begin{tabular}{lcl|ccccccc|ccccccc}
\toprule
                             &      &     & \multicolumn{7}{l}{\textbf{Cora ML}} & \multicolumn{7}{l}{\textbf{Citeseer}} \\
                             &      & \textbf{Architecture} & \makecell{Vanilla\\GCN} & \makecell{Vanilla\\GDC} & \makecell{SVD\\GCN} & \makecell{Jaccard\\GCN} &  \makecell{RGCN} & \makecell{Soft\\Medoid\\GDC} & \makecell{Soft\\Median\\GDC} & \makecell{Vanilla\\GCN} & \makecell{Vanilla\\GDC} & \makecell{SVD\\GCN} & \makecell{Jaccard\\GCN} &  \makecell{RGCN} & \makecell{Soft\\Medoid\\GDC} & \makecell{Soft\\Median\\GDC} \\
\rotatebox{90}{\textbf{Attack}} & \makecell{\textbf{Frac.}\\\textbf{edges}\\\(\boldsymbol{\epsilon}\)} & \textbf{Loss} &                         &   

# Datasets

In [35]:
def value_to_storage(val: float, decimals: int = 2):
    if val / 1e3 < 1:
        return f'{val:.{decimals}f} B'
    if val / 1e6 < 1:
        return f'{val / 1e3:.{decimals}f} kB'
    if val / 1e9 < 1:
        return f'{val / 1e6:.{decimals}f} MB'
    if val / 1e12 < 1:
        return f'{val / 1e9:.{decimals}f} GB'
    if val / 1e15 < 1:
        return f'{val / 1e12:.{decimals}f} TB'
    if val / 1e18 < 1:
        return f'{val / 1e15:.{decimals}f} PB'
    raise ValueError(f'{val} is too big for Peta!!!')

In [36]:
from ogb.nodeproppred import PygNodePropPredDataset

from rgnn_at_scale.data import load_dataset

dataset_df = []
for dataset in tqdm.tqdm(['cora_ml', 'citeseer', 'pubmed', 'ogbn-arxiv', 'ogbn-products', 'ogbn-papers100M']):
    if dataset.startswith('ogbn'):
        pyg_dataset = PygNodePropPredDataset(root='/nfs/staff-ssd/geisler/dontrobme/datasets', name=dataset)
        nnodes = pyg_dataset[0].x.shape[0]
        nedges = pyg_dataset[0].edge_index.shape[1]
        nfeatures = pyg_dataset[0].x.shape[1]
    else:
        graph = load_dataset(dataset, '/nfs/staff-ssd/geisler/dontrobme/datasets')
        #graph = load_and_standardize(dataset)
        nnodes = graph.adj_matrix.shape[0]
        nedges = graph.adj_matrix.nnz
        nfeatures = graph.attr_matrix.shape[1]
    dataset_df.append({
            r'\textbf{Dataset}': dataset_map[dataset].replace(r'rotatebox{90}', r'textbf'),
            r'\textbf{\#Nodes $n$}': f'{nnodes:,}',
            r'\textbf{\#Edges $e$}': f'{nedges:,}',
            r'\textbf{\#Features $d$}': f'{nfeatures:,}',
            #r'\textbf{\#Possible edges}': f'{nnodes ** 2:.3E}',
            r'\textbf{Size (dense)}': value_to_storage(4 * nnodes ** 2),
            r'\textbf{Size (sparse)}': value_to_storage(2*8*nedges + 4*nedges),
        })
dataset_df = pd.DataFrame(dataset_df).set_index(r'\textbf{Dataset}')
dataset_df

100%|██████████| 6/6 [00:52<00:00,  8.72s/it]


Unnamed: 0_level_0,\textbf{\#Nodes $n$},\textbf{\#Edges $e$},\textbf{\#Features $d$},\textbf{Size (dense)},\textbf{Size (sparse)}
\textbf{Dataset},Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
\textbf{Cora ML},2995,8416,2879,35.88 MB,168.32 kB
\textbf{Citeseer},3312,4715,3703,43.88 MB,94.30 kB
\textbf{PubMed},19717,88648,500,1.56 GB,1.77 MB
\textbf{arXiv},169343,1166243,128,114.71 GB,23.32 MB
\textbf{Products},2449029,123718280,100,23.99 TB,2.47 GB
\textbf{Papers 100M},111059956,1615685872,128,49.34 PB,32.31 GB


In [37]:
print(dataset_df
    #.sort_index()
    .to_latex(
        float_format='%.2f',
        caption='Statistics of the used datasets. For the dense adjacency matrix we assume that each elements is represented by 4 bytes. In the sparse case we use two 8 byte integer pointers and a 4 bytes float value.', 
        label='tab:datasets',
        escape=False
    )
)

\begin{table}
\centering
\caption{Statistics of the used datasets. For the dense adjacency matrix we assume that each elements is represented by 4 bytes. In the sparse case we use two 8 byte integer pointers and a 4 bytes float value.}
\label{tab:datasets}
\begin{tabular}{llllll}
\toprule
{} & \textbf{\#Nodes $n$} & \textbf{\#Edges $e$} & \textbf{\#Features $d$} & \textbf{Size (dense)} & \textbf{Size (sparse)} \\
\textbf{Dataset}     &                      &                      &                         &                       &                        \\
\midrule
\textbf{Cora ML}     &                2,995 &                8,416 &                   2,879 &              35.88 MB &              168.32 kB \\
\textbf{Citeseer}    &                3,312 &                4,715 &                   3,703 &              43.88 MB &               94.30 kB \\
\textbf{PubMed}      &               19,717 &               88,648 &                     500 &               1.56 GB &                1.77 

# PPR

In [38]:
def ppr(A):
    A_norm = A / A.sum(-1)[:, None]
    return alpha * torch.inverse(torch.eye(4) + (alpha - 1) * A_norm)

def update(A_dense, ppr_original, u, v):
    i = torch.nonzero(u.flatten()).item()
    
    row = A_dense[i] + v
    row = row / row.sum()
    A_norm = A_dense[i] / A_dense[i].sum()
    row_diff = row - A_norm
    row_diff_norm = (alpha - 1) * row_diff
    print(row_diff_norm)

    P_inv = (1 / alpha) * ppr_original
    P_uv_inv = P_inv - (P_inv @ u @ row_diff_norm @ P_inv) / (1 + row_diff_norm @ P_inv @ u)
    ppr_update = alpha * P_uv_inv
    
    return ppr_update

alpha = 0.15
A_dense_list = [
    torch.tensor([[0, 1, 0, 1],
                  [1, 0, 1, 0],
                  [0, 0, 0, 1],
                  [1, 1, 1, 0]],
                 dtype=torch.float32),    
    torch.tensor([[0, 1, 0, 1],
                  [1, 0, 1, 0],
                  [0, 1, 0, 1],
                  [1, 1, 1, 0]],
                 dtype=torch.float32),
    torch.tensor([[0, 1, 0, 1],
                  [1, 0, 1, 0],
                  [0, 0, 0, 1],
                  [1, 1, 1, 0]],
                 dtype=torch.float32),    
    torch.tensor([[0, 1, 0, 1],
                  [1, 0, 1, 0],
                  [0, 1, 0, 1],
                  [1, 1, 1, 0]],
                 dtype=torch.float32)
]


u_list = [
    torch.tensor([[0], [0], [1], [0]], dtype=torch.float32),
    torch.tensor([[0], [0], [1], [0]], dtype=torch.float32),
    torch.tensor([[1], [0], [0], [0]], dtype=torch.float32),
    torch.tensor([[0], [1], [0], [0]], dtype=torch.float32),
    torch.tensor([[0], [0], [0], [1]], dtype=torch.float32)
]
v_list = [
    torch.tensor([[0.3, 0.1, 0, 0.3]], dtype=torch.float32, requires_grad=True),
    torch.tensor([[0.3, 0.1, 0, 0.3]], dtype=torch.float32, requires_grad=True),
    torch.tensor([[0.3, 0.1, 12, 0.3]], dtype=torch.float32, requires_grad=True),
    torch.tensor([[0.3, 0.1, 0.8, 0.3]], dtype=torch.float32, requires_grad=True),
    torch.tensor([[0.3, 0.1, 0.8, 0.3]], dtype=torch.float32, requires_grad=True)
]


for A_dense, u, v in zip(A_dense_list, u_list, v_list):

    ppr_original = ppr(A_dense)
    print(ppr_original)

    A_pert = A_dense + u@v
    print(A_pert)

    ppr_target = ppr(A_dense + u@v)
    print('Soll', ppr_target)

    ppr_update = update(A_dense, ppr_original, u, v)
    print('Ist', ppr_update)

    ppr_update.sum().backward()
    print(v.grad)

    assert torch.allclose(ppr_update, ppr_target)

tensor([[0.3235, 0.2182, 0.1735, 0.2849],
        [0.2093, 0.3146, 0.2093, 0.2669],
        [0.1690, 0.1690, 0.3190, 0.3430],
        [0.1988, 0.1988, 0.1988, 0.4035]])
tensor([[0.0000, 1.0000, 0.0000, 1.0000],
        [1.0000, 0.0000, 1.0000, 0.0000],
        [0.3000, 0.1000, 0.0000, 1.3000],
        [1.0000, 1.0000, 1.0000, 0.0000]], grad_fn=<AddBackward0>)
Soll tensor([[0.3452, 0.2280, 0.1697, 0.2570],
        [0.2355, 0.3264, 0.2048, 0.2332],
        [0.2090, 0.1871, 0.3122, 0.2917],
        [0.2238, 0.2101, 0.1946, 0.3716]], grad_fn=<MulBackward0>)
tensor([[-0.1500, -0.0500, -0.0000,  0.2000]], grad_fn=<MulBackward0>)
Ist tensor([[0.3452, 0.2280, 0.1697, 0.2570],
        [0.2355, 0.3264, 0.2048, 0.2332],
        [0.2090, 0.1871, 0.3122, 0.2917],
        [0.2238, 0.2101, 0.1946, 0.3716]], grad_fn=<MulBackward0>)
tensor([[2.3842e-07, 4.7684e-07, 4.7684e-07, 2.3842e-07]])
tensor([[0.3288, 0.2768, 0.1788, 0.2157],
        [0.2157, 0.3853, 0.2157, 0.1833],
        [0.1788, 0.2768, 0.32

	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1595629411241/work/torch/csrc/utils/python_arg_parser.cpp:766.)
  i = torch.nonzero(u.flatten()).item()


In [39]:
for A_dense, u, v in zip(A_dense_list, u_list, v_list):

    ppr_original = ppr(A_dense)
    print(ppr_original)

    A_pert = A_dense + u@v
    print(A_pert)

    ppr_target = ppr(A_dense + u@v)
    print('Soll', ppr_target)

    ppr_update = update(A_dense, ppr_original, u, v)
    print('Ist', ppr_update)

    ppr_update.sum().backward()
    print(v.grad)

    assert torch.allclose(ppr_update, ppr_target)

tensor([[0.3235, 0.2182, 0.1735, 0.2849],
        [0.2093, 0.3146, 0.2093, 0.2669],
        [0.1690, 0.1690, 0.3190, 0.3430],
        [0.1988, 0.1988, 0.1988, 0.4035]])
tensor([[0.0000, 1.0000, 0.0000, 1.0000],
        [1.0000, 0.0000, 1.0000, 0.0000],
        [0.3000, 0.1000, 0.0000, 1.3000],
        [1.0000, 1.0000, 1.0000, 0.0000]], grad_fn=<AddBackward0>)
Soll tensor([[0.3452, 0.2280, 0.1697, 0.2570],
        [0.2355, 0.3264, 0.2048, 0.2332],
        [0.2090, 0.1871, 0.3122, 0.2917],
        [0.2238, 0.2101, 0.1946, 0.3716]], grad_fn=<MulBackward0>)
tensor([[-0.1500, -0.0500, -0.0000,  0.2000]], grad_fn=<MulBackward0>)
Ist tensor([[0.3452, 0.2280, 0.1697, 0.2570],
        [0.2355, 0.3264, 0.2048, 0.2332],
        [0.2090, 0.1871, 0.3122, 0.2917],
        [0.2238, 0.2101, 0.1946, 0.3716]], grad_fn=<MulBackward0>)
tensor([[4.7684e-07, 9.5367e-07, 9.5367e-07, 4.7684e-07]])
tensor([[0.3288, 0.2768, 0.1788, 0.2157],
        [0.2157, 0.3853, 0.2157, 0.1833],
        [0.1788, 0.2768, 0.32