This script evaluates the performance of different graph-aware architectures on a node classification problem. Several datasets are used, with special attention paid to their homophily ratio.

In [6]:
import time
import numpy as np
from pandas import DataFrame
import matplotlib.pyplot as plt
import dgl
import networkx as nx
import torch
import torch.nn as nn
from IPython.display import display

import utils
from gsp_utils.baselines_archs import MLP, GAT, GCNN, GraphSAGE, GIN
from gsp_utils.baselines_models import NodeClassModel, GF_NodeClassModel
from gsp_utils.data import normalize_gso
from src.arch import GFGCN, GFGCNLayer, GFGCN_noh_Layer, GFGCN_Spows, Dual_GFGCN, NV_GFGCN

# Global settings shared by every cell below.
SEED = 15  # an earlier configuration used SEED = 0
PATH = 'results/try_datasets'  # prefix of the files written when SAVE is enabled
SAVE = True

# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Kept as the last expression so the notebook still displays the returned generator.
torch.manual_seed(SEED)

cuda:0


<torch._C.Generator at 0x7fd6435827b0>

## Auxiliary functions

In [3]:
def select_GSO(A, gso_type, self_loops=True, norm=False, dev='cpu'):
    """Build the graph-shift operator (GSO) selected by ``gso_type``.

    Args:
        A: adjacency matrix; it is handed to ``nx.from_numpy_array`` or
            ``torch.Tensor`` depending on ``gso_type``.
        gso_type: ``'A-dgl'`` builds a DGL graph from ``A``; ``'A'`` builds a
            torch tensor from ``A``; ``'none'`` or ``None`` selects no GSO.
        self_loops: only read for ``'A-dgl'``; when true, self loops are
            added to the DGL graph.
        norm: only read for ``'A'``; when true, ``A`` first goes through
            ``normalize_gso(A, 'both', add_id=False)``.
        dev: device the resulting operator is moved to.

    Returns:
        The operator moved to ``dev``, or ``None`` when no GSO is requested.

    Raises:
        ValueError: if ``gso_type`` is not one of the supported values.
    """
    # Nothing to build (and nothing to move to a device) when no GSO is wanted.
    if gso_type is None or gso_type == 'none':
        return None

    if gso_type == 'A-dgl':
        gso = dgl.from_networkx(nx.from_numpy_array(A))
        if self_loops:
            gso = gso.add_self_loop()
    elif gso_type == 'A':
        matrix = normalize_gso(A, 'both', add_id=False) if norm else A
        gso = torch.Tensor(matrix)
    else:
        raise ValueError('Unknown type of GSO')

    return gso.to(dev)

def run_exp(Exps, datasets, n_runs, device):
    """Train and test every experiment on every dataset for ``n_runs`` runs.

    For each dataset and run the data is loaded once with
    ``utils.get_data_dgl`` and then every experiment in ``Exps`` is built,
    trained and tested on it.

    Args:
        Exps: sequence of experiment dicts. Each one provides 'arch' (callable
            building the architecture), 'arc_args' (its keyword arguments,
            merged over the default 'in_dim'/'out_dim'), 'sel_GSO' (keyword
            arguments for ``select_GSO``), 'model' (callable wrapping the
            architecture), 'mod_args' and 'train_args'.
        datasets: sequence of dataset identifiers forwarded to
            ``utils.get_data_dgl``.
        n_runs: number of runs per dataset.
        device: device forwarded to the data loader, ``select_GSO`` and the
            model.

    Returns:
        Tuple ``(accs_test, accs_max_val, ellapsed_times)`` of arrays of shape
        ``(len(Exps), len(datasets), n_runs)`` holding, respectively, the
        value returned by ``model.test``, the entry of ``acc["test"]`` at the
        position where ``acc["val"]`` is largest, and the training time in
        minutes.
    """
    accs_test = np.zeros((len(Exps), len(datasets), n_runs))
    accs_max_val = np.zeros_like(accs_test)
    ellapsed_times = np.zeros_like(accs_test)

    for j, dataset in enumerate(datasets):
        print(dataset)

        for i in range(n_runs):
            print(f'{i}:', end=' ')

            # NOTE(review): idx looks like a data-split index with 10 available
            # splits (hence the modulo) - confirm against utils.get_data_dgl.
            A, feat, labels, n_class, masks = utils.get_data_dgl(dataset, dev=device, idx=i % 10)

            # Default arguments
            d_arc_args = {'in_dim': feat.shape[1], 'out_dim': n_class}

            for k, exp in enumerate(Exps):
                # Experiment-specific arguments take precedence over the defaults.
                arc_args = {**d_arc_args, **exp['arc_args']}

                arch = exp['arch'](**arc_args)
                S_sel = select_GSO(A, **exp['sel_GSO'], dev=device)
                model = exp['model'](arch, S_sel, masks=masks, **exp['mod_args'], device=device)

                # perf_counter is monotonic, so the measured interval cannot be
                # distorted by system clock adjustments.
                t_i = time.perf_counter()
                _, acc = model.train(feat, labels, **exp['train_args'])
                ellapsed_times[k, j, i] = (time.perf_counter() - t_i) / 60

                accs_test[k, j, i] = model.test(feat, model.S, labels, masks['test'])
                accs_max_val[k, j, i] = acc["test"][np.argmax(acc["val"])]

                print(f'{accs_test[k,j,i]:.3f} ({ellapsed_times[k,j,i]:.3f})', end=' - ')

            # Close the line of this run so the next run/dataset starts on its own line.
            print()

    return accs_test, accs_max_val, ellapsed_times

def print_full_results(accs, ellapsed_time, datasets, exps):
    """Print per-dataset statistics of every experiment.

    Args:
        accs: array indexed as (experiment, dataset, run).
        ellapsed_time: array indexed like ``accs`` with the time of each run;
            it is reported in the output as minutes.
        datasets: names of dataset classes looked up as attributes of
            ``dgl.data``; each one is instantiated to compute its edge
            homophily.
        exps: experiment dicts; the 'leg' entry is used as the printed label.
    """
    # Every statistic is reduced over the run axis.
    avg = accs.mean(axis=2)
    med = np.median(accs, axis=2)
    std = accs.std(axis=2)
    avg_time = ellapsed_time.mean(axis=2)

    for col, dataset_name in enumerate(datasets):
        dataset_cls = getattr(dgl.data, dataset_name)
        graph = dataset_cls(verbose=False)[0]
        edge_hom = dgl.edge_homophily(graph, graph.ndata['label'])
        print(f'{dataset_name} (homophily ratio: {edge_hom:.3f})')

        for row, exp in enumerate(exps):
            report = (
                f'\t- {exp["leg"]}:\tmean: {avg[row, col]:.3f}'
                f' - std: {std[row, col]:.4f}'
                f' - med: {med[row, col]:.3f}'
                f' - time: {avg_time[row, col]:.2f} mins'
            )
            print(report)

        print()

def summary_table(accs, datasets, exps, median=False):
    """Build a table with one row per experiment and one column per dataset.

    Args:
        accs: array indexed as (experiment, dataset, run).
        datasets: names of dataset classes looked up as attributes of
            ``dgl.data``; each column label carries the dataset name and its
            edge homophily.
        exps: experiment dicts; the 'leg' entry labels the rows.
        median: if true, aggregate the runs with the median; otherwise (the
            default) with the mean.

    Returns:
        ``DataFrame`` holding the statistic of ``accs`` over the run axis.
    """
    # The flag used to be ignored, so the mean was returned even when the
    # median was requested.
    aggregate = np.median if median else np.mean
    values = aggregate(accs, axis=2)

    cols_name = []
    for dataset_name in datasets:
        graph = getattr(dgl.data, dataset_name)(verbose=False)[0]
        edge_hom = dgl.edge_homophily(graph, graph.ndata['label'])
        cols_name.append(f'{dataset_name} ({edge_hom:.2f})')

    index_name = [exp['leg'] for exp in exps]

    return DataFrame(values, columns=cols_name, index=index_name)


## Heterophilic datasets

In [5]:
# Experiment definition for the heterophilic datasets.
# Alternative selection that also includes two citation datasets:
# DATASETS = ['TexasDataset',  'WisconsinDataset', 'CornellDataset', 'ChameleonDataset', 'CoraGraphDataset', 'CiteseerGraphDataset']

# Each name is looked up as an attribute of dgl.data by print_full_results and summary_table.
DATASETS = ['TexasDataset',  'WisconsinDataset', 'CornellDataset', 'ChameleonDataset']
N_RUNS = 10
ACT = nn.ReLU()
# NOTE(review): nn.CrossEntropyLoss expects unnormalized logits. If the models pass the
# architecture output straight to LOSS_FN, this Softmax is redundant with the normalization
# done inside the loss - confirm against the model classes. The homophilic cell of this
# file uses nn.Identity() as last activation.
LAST_ACT = nn.Softmax(dim=1)
LOSS_FN = nn.CrossEntropyLoss()


# Keys consumed by run_exp for every entry:
#   'arch' / 'arc_args'  -> architecture constructor and its keyword arguments
#                           ('in_dim' and 'out_dim' are added by run_exp)
#   'sel_GSO'            -> keyword arguments of select_GSO
#   'model' / 'mod_args' -> model wrapper and its keyword arguments
#   'train_args'         -> keyword arguments of model.train
#   'leg'                -> label used in the printed results and tables
EXPS = [
    ### Our models
    {'arch': GFGCN_Spows, 'arc_args': {'hid_dim': 32, 'n_layers': 3, 'K': 2, 'act': ACT, 'dev': device,
     'l_act': LAST_ACT, 'dropout': .5,}, 'sel_GSO': {'gso_type': 'A', 'norm': True}, 'model': GF_NodeClassModel,
     'mod_args': {'loss': LOSS_FN, 'K': 2}, 'train_args': {'epochs': 200, 'epochs_h': 5, 'epochs_W': 25, 'lr': .005, 'wd': 5e-4},
     'leg': 'AFGNN-normH'},

    {'arch': GFGCN, 'arc_args': {'diff_layer': GFGCNLayer, 'hid_dim': 32, 'n_layers': 2, 'K': 3, 'act': ACT, 'h0': 1,
     'l_act': LAST_ACT, 'dropout': .25,}, 'sel_GSO': {'gso_type': 'A', 'norm': False}, 'model': GF_NodeClassModel,
     'mod_args': {'loss': LOSS_FN, 'K': 3}, 'train_args': {'epochs': 200, 'epochs_h': 25, 'epochs_W': 25, 'lr': .005, 'wd': .001},
     'leg': 'AFGNN'},

    {'arch': GFGCN, 'arc_args': {'diff_layer': GFGCNLayer, 'hid_dim': 50, 'n_layers': 3, 'K': 2, 'act': ACT, 'h0': 1,
     'l_act': LAST_ACT, 'dropout': .25,}, 'sel_GSO': {'gso_type': 'A', 'norm': True}, 'model': GF_NodeClassModel,
     'mod_args': {'loss': LOSS_FN, 'K': 2}, 'train_args': {'epochs': 200, 'epochs_h': 10, 'epochs_W': 10, 'lr': .005, 'wd': .001},
     'leg': 'AFGNN-normA'},

    # ### Baselines
    {'arch': GFGCN, 'arc_args': {'diff_layer': GFGCN_noh_Layer, 'hid_dim': 32, 'n_layers': 2, 'K': 2, 'act': ACT, 
     'l_act': LAST_ACT, 'dropout': .25,}, 'sel_GSO': {'gso_type': 'A', 'norm': True}, 'model': NodeClassModel,
     'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 500, 'lr': .005, 'wd': .01}, 'leg': 'W-GCN-norm'},

    # # GCNNs - Identity?
    {'arch': GCNN, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'norm': 'both'}, 'mod_args': {'loss': LOSS_FN},
     'sel_GSO': {'gso_type': 'A-dgl'}, 'model': NodeClassModel, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4}, 'leg': 'Kipf-norm'},
    {'arch': GCNN, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'norm': 'none'}, 'sel_GSO': {'gso_type': 'A-dgl'},
     'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4}, 'leg': 'Kipf'},
    # MLP - Identity?
    {'arch': MLP, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'n_layers': 2}, 'sel_GSO': {'gso_type': 'none'},
     'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4}, 'leg': 'MLP'},
    # GAT
    {'arch': GAT, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'num_heads': 2, 'gat_params': {'attn_drop': 0}},
     'sel_GSO': {'gso_type': 'A-dgl', 'self_loops': False}, 'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200,
     'lr': .01, 'wd': 5e-4}, 'leg': 'GAT'},
    # GraphSAGE
    {'arch': GraphSAGE, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'n_layers': 2,},
     'sel_GSO': {'gso_type': 'A-dgl', 'self_loops': False}, 'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4},
     'leg': 'SAGE'},
    # GIN
    {'arch': GIN, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'n_layers': 2, 'mlp_layers': 2},
     'sel_GSO': {'gso_type': 'A-dgl', 'self_loops': False}, 'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4},
     'leg': 'GIN'},
]


# Run every experiment on every dataset; the arrays are indexed as (experiment, dataset, run).
accs_test_het, accs_max_val_het, ellapsed_times_het = run_exp(EXPS, DATASETS, N_RUNS, device)

# Per-experiment / per-dataset summaries of the three result arrays.
table_accs_het = summary_table(accs_test_het, DATASETS, EXPS)
table_accs_maxval_het = summary_table(accs_max_val_het, DATASETS, EXPS)
table_time_het= summary_table(ellapsed_times_het, DATASETS, EXPS)

TexasDataset
0: 0.730 (0.012) - WisconsinDataset
0: 0.549 (0.020) - TexasDataset (homophily ratio: 0.108)
	- AFGNN-normH:	mean: 0.838 - std: 0.0000 - med: 0.838 - time: 0.38 mins
	- AFGNN:	mean: 0.811 - std: 0.0000 - med: 0.811 - time: 0.28 mins
	- AFGNN-normA:	mean: 0.784 - std: 0.0000 - med: 0.784 - time: 0.16 mins
	- W-GCN-norm:	mean: 0.838 - std: 0.0000 - med: 0.838 - time: 0.02 mins
	- Kipf-norm:	mean: 0.622 - std: 0.0000 - med: 0.622 - time: 0.02 mins
	- Kipf:	mean: 0.595 - std: 0.0000 - med: 0.595 - time: 0.02 mins
	- MLP:	mean: 0.784 - std: 0.0000 - med: 0.784 - time: 0.01 mins
	- GAT:	mean: 0.703 - std: 0.0000 - med: 0.703 - time: 0.04 mins
	- SAGE:	mean: 0.838 - std: 0.0000 - med: 0.838 - time: 0.02 mins
	- GIN:	mean: 0.730 - std: 0.0000 - med: 0.730 - time: 0.01 mins

WisconsinDataset (homophily ratio: 0.196)
	- AFGNN-normH:	mean: 0.745 - std: 0.0000 - med: 0.745 - time: 0.31 mins
	- AFGNN:	mean: 0.863 - std: 0.0000 - med: 0.863 - time: 0.30 mins
	- AFGNN-normA:	mean: 0.824 

In [None]:
if SAVE:
    # Minute-resolution timestamp, so executions at different times get different files.
    timestr = time.strftime("%Y%m%d-%H%M")
    # PATH is a bare prefix without trailing separator: join the parts explicitly
    # and add the extension of the format written by to_csv.
    table_accs_het.to_csv(f'{PATH}_heterophilic_{timestr}.csv')

# Detailed per-dataset report followed by the summary table.
print_full_results(accs_test_het, ellapsed_times_het, DATASETS, EXPS)
display(table_accs_het)

## Homophilic datasets

In [None]:
# Experiment definition for the homophilic datasets.
# Each name is looked up as an attribute of dgl.data by print_full_results and summary_table.
DATASETS = ['CoraGraphDataset', 'CiteseerGraphDataset']
N_RUNS = 10
ACT = nn.ReLU()
# The architectures using LAST_ACT output their last layer unchanged.
LAST_ACT = nn.Identity()
LOSS_FN = nn.CrossEntropyLoss()


# Keys consumed by run_exp for every entry:
#   'arch' / 'arc_args'  -> architecture constructor and its keyword arguments
#                           ('in_dim' and 'out_dim' are added by run_exp)
#   'sel_GSO'            -> keyword arguments of select_GSO
#   'model' / 'mod_args' -> model wrapper and its keyword arguments
#   'train_args'         -> keyword arguments of model.train
#   'leg'                -> label used in the printed results and tables
EXPS = [
    ### Our models
    # NOTE(review): the two AFGNN entries use nn.LogSoftmax as last activation together
    # with nn.CrossEntropyLoss, while every other entry uses LAST_ACT - confirm this
    # difference is intended.
    {'arch': GFGCN, 'arc_args': {'diff_layer': GFGCNLayer, 'hid_dim': 64, 'n_layers': 2, 'K': 3, 'act': ACT, 'h0': 1,
     'l_act': nn.LogSoftmax(dim=1), 'dropout': .25,}, 'sel_GSO': {'gso_type': 'A', 'norm': False}, 'model': GF_NodeClassModel,
     'mod_args': {'loss': LOSS_FN, 'K': 3}, 'train_args': {'epochs': 200, 'epochs_h': 25, 'epochs_W': 25, 'lr': .01, 'wd': .001},
     'leg': 'AFGNN'},

    {'arch': GFGCN, 'arc_args': {'diff_layer': GFGCNLayer, 'hid_dim': 64, 'n_layers': 2, 'K': 3, 'act': ACT, 'h0': 1,
     'l_act': nn.LogSoftmax(dim=1), 'dropout': .25,}, 'sel_GSO': {'gso_type': 'A', 'norm': True}, 'model': GF_NodeClassModel,
     'mod_args': {'loss': LOSS_FN, 'K': 3}, 'train_args': {'epochs': 200, 'epochs_h': 25, 'epochs_W': 25, 'lr': .01, 'wd': .001},
     'leg': 'AFGNN-normA'},

    # NOTE(review): this is the only entry whose 'train_args' carry a 'dropout' key; run_exp
    # forwards train_args to model.train as keyword arguments - confirm that
    # GF_NodeClassModel.train accepts it.
    {'arch': GFGCN_Spows, 'arc_args': {'hid_dim': 32, 'n_layers': 3, 'K': 2, 'act': ACT, 'dev': device,
     'l_act': LAST_ACT, 'dropout': .5,}, 'sel_GSO': {'gso_type': 'A', 'norm': True}, 'model': GF_NodeClassModel,
     'mod_args': {'loss': LOSS_FN, 'K': 2}, 'train_args': {'epochs': 200, 'epochs_h': 5, 'epochs_W': 25, 'lr': .001, 'wd': 5e-4,
     'dropout': .5,},
     'leg': 'AFGNN-normH'},

    # ### Baselines
    {'arch': GFGCN, 'arc_args': {'diff_layer': GFGCN_noh_Layer, 'hid_dim': 64, 'n_layers': 2, 'K': 3, 'act': nn.Tanh(), 
     'l_act': LAST_ACT, 'dropout': .25,}, 'sel_GSO': {'gso_type': 'A', 'norm': True}, 'model': NodeClassModel,
     'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 500, 'lr': .01, 'wd': .01}, 'leg': 'W-GCN-norm'},

    # # GCNNs - Identity?
    {'arch': GCNN, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'norm': 'both'}, 'mod_args': {'loss': LOSS_FN},
     'sel_GSO': {'gso_type': 'A-dgl'}, 'model': NodeClassModel, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4}, 'leg': 'Kipf-norm'},
    {'arch': GCNN, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'norm': 'none'}, 'sel_GSO': {'gso_type': 'A-dgl'},
     'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4}, 'leg': 'Kipf'},
    # MLP - Identity?
    {'arch': MLP, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'n_layers': 3}, 'sel_GSO': {'gso_type': 'none'},
     'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4}, 'leg': 'MLP'},
    # GAT
    {'arch': GAT, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'num_heads': 2, 'gat_params': {'attn_drop': 0}},
     'sel_GSO': {'gso_type': 'A-dgl', 'self_loops': False}, 'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200,
     'lr': .01, 'wd': 5e-4}, 'leg': 'GAT'},
    # GraphSAGE
    {'arch': GraphSAGE, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'n_layers': 2,},
     'sel_GSO': {'gso_type': 'A-dgl', 'self_loops': False}, 'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4},
     'leg': 'SAGE'},
    # GIN
    {'arch': GIN, 'arc_args': {'hid_dim': 16, 'act': ACT, 'l_act': LAST_ACT, 'dropout': .5, 'n_layers': 2, 'mlp_layers': 2},
     'sel_GSO': {'gso_type': 'A-dgl', 'self_loops': False}, 'model': NodeClassModel, 'mod_args': {'loss': LOSS_FN}, 'train_args': {'epochs': 200, 'lr': .01, 'wd': 5e-4},
     'leg': 'GIN'},
]


# Run every experiment on every dataset; the arrays are indexed as (experiment, dataset, run).
accs_test_hom, accs_max_val_hom, ellapsed_times_hom = run_exp(EXPS, DATASETS, N_RUNS, device)

# Per-experiment / per-dataset summaries of the three result arrays.
table_accs_hom = summary_table(accs_test_hom, DATASETS, EXPS)
table_accs_maxval_hom = summary_table(accs_max_val_hom, DATASETS, EXPS)
table_time_hom = summary_table(ellapsed_times_hom, DATASETS, EXPS)

TexasDataset
0: 0.730 (0.012) - WisconsinDataset
0: 0.549 (0.020) - TexasDataset (homophily ratio: 0.108)
	- AFGNN-normH:	mean: 0.838 - std: 0.0000 - med: 0.838 - time: 0.38 mins
	- AFGNN:	mean: 0.811 - std: 0.0000 - med: 0.811 - time: 0.28 mins
	- AFGNN-normA:	mean: 0.784 - std: 0.0000 - med: 0.784 - time: 0.16 mins
	- W-GCN-norm:	mean: 0.838 - std: 0.0000 - med: 0.838 - time: 0.02 mins
	- Kipf-norm:	mean: 0.622 - std: 0.0000 - med: 0.622 - time: 0.02 mins
	- Kipf:	mean: 0.595 - std: 0.0000 - med: 0.595 - time: 0.02 mins
	- MLP:	mean: 0.784 - std: 0.0000 - med: 0.784 - time: 0.01 mins
	- GAT:	mean: 0.703 - std: 0.0000 - med: 0.703 - time: 0.04 mins
	- SAGE:	mean: 0.838 - std: 0.0000 - med: 0.838 - time: 0.02 mins
	- GIN:	mean: 0.730 - std: 0.0000 - med: 0.730 - time: 0.01 mins

WisconsinDataset (homophily ratio: 0.196)
	- AFGNN-normH:	mean: 0.745 - std: 0.0000 - med: 0.745 - time: 0.31 mins
	- AFGNN:	mean: 0.863 - std: 0.0000 - med: 0.863 - time: 0.30 mins
	- AFGNN-normA:	mean: 0.824 

In [None]:
if SAVE:
    # Minute-resolution timestamp, so executions at different times get different files.
    timestr = time.strftime("%Y%m%d-%H%M")
    # These are the homophilic results: label the file accordingly so it cannot
    # be mistaken for (or overwrite) the heterophilic table. PATH is a bare
    # prefix without trailing separator, so join the parts explicitly and add
    # the extension of the format written by to_csv.
    table_accs_hom.to_csv(f'{PATH}_homophilic_{timestr}.csv')

# Detailed per-dataset report followed by the summary table.
print_full_results(accs_test_hom, ellapsed_times_hom, DATASETS, EXPS)
display(table_accs_hom)