In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
from embedding_evaluation_framework import EmbeddingData, test_n2v


In [10]:
# Number of embedding initialisations evaluated per method; used as the loop
# bound in report_test_acc_unsupervised_embedding.
inits = 20
# Forwarded as the `num_splits` argument of test_n2v for every embedding.
num_splits = 100

In [11]:
# Dataset identifiers iterated over by the test-accuracy evaluation loops.
datasets = [
    'cora',
    'citeseer',
    'PubMed',
    'cora_full',
]

In [12]:
def report_test_acc_unsupervised_embedding(cache_prefix,dataset,filename,method,speedup=False):
    """Collect test results for one embedding method over all initialisations.

    For every ``init`` in ``range(inits)`` an ``EmbeddingData`` object is
    built (with ``directed=False``) and its first element is scored with
    ``test_n2v``; the per-initialisation results are concatenated in order.

    Args:
        cache_prefix: Prefix of the ``/tmp/...`` path handed to
            ``EmbeddingData``; it is combined with the capitalised dataset
            name and the initialisation index.
        dataset: Dataset identifier passed to ``EmbeddingData``.
        filename: File name passed to ``EmbeddingData``.
        method: Used to build the ``initialization`` argument
            ``f'{method}/{init}'``.
        speedup: Forwarded unchanged to ``test_n2v``.

    Returns:
        The concatenation of the values returned by ``test_n2v`` for each
        initialisation (presumably a flat list of accuracies — confirm
        against ``test_n2v``).
    """
    collected = []
    for run in range(inits):
        print(f' init {run}')
        cache_path = f'/tmp/{cache_prefix}Emb{dataset.capitalize()}Init{run}'
        embedding = EmbeddingData(
            cache_path,
            dataset,
            filename,
            directed=False,
            initialization=f'{method}/{run}',
        )
        scores = test_n2v(embedding[0],num_splits=num_splits,speedup=speedup)
        collected = collected + scores
    return collected

In [13]:
def load_and_combine_gat(method='gat'):
    """Merge the per-run GAT result CSVs into one DataFrame and save it.

    Reads every file in ``../reports/results/eval/gat`` whose name starts
    with ``gat``, ends with ``csv``, does not contain ``PubMed`` and is not
    ``gat.csv``; concatenates them with a fresh integer index and writes the
    result to ``../reports/results/eval/{method}.csv`` (without the index).

    Files are read in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the row order (and so the
    index labels of the combined frame) differ between runs or machines.

    Args:
        method: Only determines the name of the output CSV; the input
            directory and the ``gat`` file-name filter are fixed.

    Returns:
        The combined DataFrame.

    Raises:
        ValueError: If no file in the directory matches the filter.
    """
    source_dir = '../reports/results/eval/gat'
    csv_names = sorted(
        name for name in os.listdir(source_dir)
        if name.startswith('gat') and name.endswith('csv')
        and 'PubMed' not in name and name != 'gat.csv'
    )
    if not csv_names:
        # pd.concat([]) would raise a ValueError too, but without saying why.
        raise ValueError(f'no GAT result CSVs found in {source_dir}')
    frames = [pd.read_csv(os.path.join(source_dir, name)) for name in csv_names]
    combined = pd.concat(frames,ignore_index=True)
    combined.to_csv(f'../reports/results/eval/{method}.csv',index=False)
    return combined

In [14]:
def load_and_combine(method):
    """Concatenate the three validation-result CSVs of ``method`` and save them.

    Reads ``eval/{method}_val_12_24.csv``, ``eval/{method}_val_48.csv`` and
    ``eval/{method}_val_96.csv`` (in that order), stacks them with a fresh
    integer index, writes the result to ``eval/{method}.csv`` without the
    index column, and returns the combined DataFrame.
    """
    sources = (
        f'eval/{method}_val_12_24.csv',
        f'eval/{method}_val_48.csv',
        f'eval/{method}_val_96.csv',
    )
    parts = [pd.read_csv(path) for path in sources]
    merged = pd.concat(parts,ignore_index=True)
    merged.to_csv(f'eval/{method}.csv',index=False)
    return merged

In [15]:
def model_selection_gnn(df):
    """Return the best-validation row(s) for every architecture.

    Args:
        df: DataFrame with at least an ``arch`` and a ``val_avg`` column.

    Returns:
        The rows of ``df`` whose ``val_avg`` equals the maximum ``val_avg``
        within their ``arch`` group; rows tied for a group's maximum are all
        kept. The original index labels are preserved.
    """
    # Name the reduction as a string: passing the Python builtin ``max`` to
    # ``transform`` raises a FutureWarning in recent pandas, which will stop
    # translating the builtin to the group-wise maximum.
    best_per_arch = df.groupby('arch')['val_avg'].transform('max')
    return df[df['val_avg'] == best_per_arch]
def model_selection(df):
    """Return the row(s) of ``df`` with the highest ``val_avg``.

    Every row tied for the maximum is returned; index labels are preserved.
    """
    top_score = df['val_avg'].max()
    is_best = df['val_avg'] == top_score
    return df[is_best]

In [16]:
# Hyper-parameter columns (plus the validation average) shown in the
# per-architecture model-selection tables.
hypercols = ['conv', 'arch', 'ch', 'dropout', 'heads', 'lr', 'wd', 'val_avg']

In [18]:
# Best configuration per architecture from the SAGE / Twitter (undirected) validation file.
(
    pd.read_csv('../reports/results/eval/sage_val_twitter_undirected.csv')
    .pipe(model_selection_gnn)
    [hypercols]
)

Unnamed: 0,conv,arch,ch,dropout,heads,lr,wd,val_avg
502,SAGEConv,B,96,0.4,1,0.005,0.1,0.714533
507,SAGEConv,M,96,0.6,1,0.005,0.001,0.736133
524,SAGEConv,T,96,0.8,1,0.005,0.01,0.7208


In [19]:
# Best configuration per architecture from the GCN / Twitter (undirected) validation file.
(
    pd.read_csv('../reports/results/eval/gcn_val_twitter_undirected.csv')
    .pipe(model_selection_gnn)
    [hypercols]
)

Unnamed: 0,conv,arch,ch,dropout,heads,lr,wd,val_avg
392,GCNConv,T,48,0.2,1,0.01,0.01,0.715067
538,GCNConv,B,96,0.2,1,0.01,0.1,0.709733
540,GCNConv,M,96,0.4,1,0.01,0.0001,0.7188


In [20]:
# Best configuration per architecture from the SAGE / WebKB (undirected) validation file.
(
    pd.read_csv('../reports/results/eval/sage_val_webkb_undirected.csv')
    .pipe(model_selection_gnn)
    [hypercols]
)

Unnamed: 0,conv,arch,ch,dropout,heads,lr,wd,val_avg
534,SAGEConv,M,96,0.2,1,0.01,0.01,0.389547
539,SAGEConv,T,96,0.2,1,0.01,0.1,0.39104
550,SAGEConv,B,96,0.4,1,0.01,0.1,0.393387


In [21]:
# Best configuration per architecture from the GCN / WebKB (undirected) validation file.
(
    pd.read_csv('../reports/results/eval/gcn_val_webkb_undirected.csv')
    .pipe(model_selection_gnn)
    [hypercols]
)

Unnamed: 0,conv,arch,ch,dropout,heads,lr,wd,val_avg
537,GCNConv,M,96,0.2,1,0.01,0.1,0.41568
539,GCNConv,T,96,0.2,1,0.01,0.1,0.44224
562,GCNConv,B,96,0.6,1,0.01,0.1,0.449707


In [8]:
# Empty results table; the evaluation loops add one row per (method, dataset).
test_acc = pd.DataFrame(
    columns=['method', 'dataset', 'test_acc', 'test_avg', 'test_std']
)

In [9]:
# Display the results table (empty at this point) to check its columns.
test_acc

Unnamed: 0,method,dataset,test_acc,test_avg,test_std


In [10]:
# Load the saved result tables of the unsupervised embedding methods
# (node2vec, LINE-1, LINE-2 and NERD) used for model selection below.
df_n2v = pd.read_csv('eval/n2v.csv')
df_line1 = pd.read_csv('eval/line-1.csv')
df_line2 = pd.read_csv('eval/line-2.csv')
df_nerd = pd.read_csv('eval/nerd.csv')

In [11]:
# node2vec row(s) with the highest validation average.
df_n2v.pipe(model_selection)

Unnamed: 0,splits,inits,p,q,val_acc,val_avg,val_std
13,,,1.75,0.75,"[0.7285714285714285, 0.680952380952381, 0.7476...",0.730412,0.033786


In [None]:
# Test accuracy of node2vec on every dataset, using the (p, q) pair picked by
# model_selection(df_n2v).
p = 1.75
q = 0.75
for dataset in datasets:
    tests = report_test_acc_unsupervised_embedding(
        cache_prefix=f'n2vUndp{p}q{q}',
        dataset=dataset,
        filename=f'node2vec_rw_{p:0.2f}_{q:0.2f}',
        method='node2vec',
        speedup=(dataset=='cora_full'),
    )
    row = {'method':'node2vec', 'dataset':dataset,
           'test_acc':tests, 'test_avg':np.mean(tests), 'test_std':np.std(tests)}
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row
    # frame gives the same result on old and new pandas versions.
    test_acc = pd.concat([test_acc, pd.DataFrame([row])], ignore_index=True)
    # Written after every dataset, so the file always holds the rows finished so far.
    test_acc.to_csv('eval/test_acc.csv')

 init 0
 init 1


In [34]:
# LINE-1 row(s) with the highest validation average.
df_line1.pipe(model_selection)

Unnamed: 0,splits,inits,lr,neg,val_acc,val_avg,val_std
0,100,20,0.005,5,"[0.6619047619047619, 0.6095238095238096, 0.709...",0.677238,0.034454


In [35]:
# LINE-2 row(s) with the highest validation average.
df_line2.pipe(model_selection)

Unnamed: 0,splits,inits,lr,neg,val_acc,val_avg,val_std
0,100,20,0.005,5,"[0.7047619047619048, 0.6428571428571429, 0.738...",0.715667,0.032059


In [None]:
# Test accuracy of LINE-1 and LINE-2 on every dataset, using the (lr, neg)
# values picked by model_selection(df_line1) / model_selection(df_line2).
lr = 0.005
neg = 5
for dataset in datasets:
    # Same evaluation for both LINE variants; only the method name and the
    # cache prefix differ.
    for line_method, line_cache_prefix in (('line1', f'lineUndlr{lr}neg{neg}'),
                                           ('line2', f'line2Undlr{lr}neg{neg}')):
        # Fix: the original passed dataset='cora' here, so every row was
        # computed on Cora while being labelled with the loop's dataset.
        tests = report_test_acc_unsupervised_embedding(
            cache_prefix=line_cache_prefix,
            dataset=dataset,
            filename=f'line_{neg}_{lr}',
            method=line_method,
            speedup=(dataset=='cora_full'),
        )
        row = {'method':line_method, 'dataset':dataset,
               'test_acc':tests, 'test_avg':np.mean(tests), 'test_std':np.std(tests)}
        # Fix: appending a dict without ignore_index=True raises TypeError,
        # and DataFrame.append was removed in pandas 2.0; use concat instead.
        test_acc = pd.concat([test_acc, pd.DataFrame([row])], ignore_index=True)
        # Written after every evaluation, so the file always holds the rows finished so far.
        test_acc.to_csv('eval/test_acc.csv')

In [33]:
# NERD row(s) with the highest validation average (ties are all shown).
df_nerd.pipe(model_selection)

Unnamed: 0,splits,inits,type,lr,neg,val_acc,val_avg,val_std
20,100,20,hub,0.025,20,"[0.7238095238095238, 0.6904761904761905, 0.752...",0.74519,0.027754
21,100,20,aut,0.025,20,"[0.7238095238095238, 0.6904761904761905, 0.752...",0.74519,0.027754


In [None]:
# Test accuracy of NERD on every dataset, using the (lr, neg) values picked by
# model_selection(df_nerd).
lr = 0.025
neg = 20
for dataset in datasets:
    # NOTE(review): the original call also passed nerd='aut', but
    # report_test_acc_unsupervised_embedding as defined in this notebook has no
    # such parameter, so the call raised TypeError. The 'hub' and 'aut' rows
    # tie in the model-selection table; re-add the argument once the helper
    # supports choosing between them.
    # NOTE(review): filename still uses the `line_` prefix — confirm this is
    # the intended NERD embedding file.
    # Fix: the original passed dataset='cora', so every row was computed on
    # Cora while being labelled with the loop's dataset.
    tests = report_test_acc_unsupervised_embedding(
        cache_prefix=f'nerdUndlr{lr}neg{neg}',
        dataset=dataset,
        filename=f'line_{neg}_{lr}',
        method='nerd',
        speedup=(dataset=='cora_full'),
    )
    row = {'method':'nerd', 'dataset':dataset,
           'test_acc':tests, 'test_avg':np.mean(tests), 'test_std':np.std(tests)}
    # Fix: appending a dict without ignore_index=True raises TypeError, and
    # DataFrame.append was removed in pandas 2.0; use concat instead.
    test_acc = pd.concat([test_acc, pd.DataFrame([row])], ignore_index=True)
    # Written after every dataset, so the file always holds the rows finished so far.
    test_acc.to_csv('eval/test_acc.csv')

In [21]:
# Build the combined GNN result tables. Besides returning the DataFrame, each
# call also writes the combined CSV to disk.
df_gcn = load_and_combine('gcn')
df_sage = load_and_combine('sage')
df_gat = load_and_combine_gat()

In [29]:
# Best GCN configuration per architecture by validation average.
df_gcn.pipe(model_selection_gnn)

Unnamed: 0,conv,arch,ch,dropout,lr,wd,heads,splits,inits,val_accs,val_avg,val_std,test_accs,test_avg,test_std,stopped,elapsed
570,GCNConv,M,96,0.8,0.01,0.01,1,100,20,"[0.819047619047619, 0.8095238095238095, 0.8047...",0.833674,0.024508,"[0.7981340118744699, 0.8044953350296862, 0.815...",0.799806,0.016325,"[19, 21, 13, 17, 15, 11, 14, 24, 13, 14, 22, 2...",2566.500325
571,GCNConv,B,96,0.8,0.01,0.01,1,100,20,"[0.8285714285714286, 0.8285714285714286, 0.804...",0.829974,0.023607,"[0.8176420695504665, 0.8104325699745547, 0.806...",0.796047,0.015225,"[21, 14, 12, 14, 17, 12, 22, 19, 11, 18, 21, 2...",2808.809885
572,GCNConv,T,96,0.8,0.01,0.01,1,100,20,"[0.8238095238095238, 0.8476190476190476, 0.809...",0.831295,0.023927,"[0.7947413061916879, 0.8155216284987278, 0.824...",0.797844,0.014636,"[19, 14, 12, 16, 12, 12, 14, 16, 22, 16, 15, 2...",2985.808164


In [30]:
# Best SAGE configuration per architecture by validation average.
df_sage.pipe(model_selection_gnn)

Unnamed: 0,conv,arch,ch,dropout,lr,wd,heads,splits,inits,val_accs,val_avg,val_std,test_accs,test_avg,test_std,stopped,elapsed
576,SAGEConv,M,96,0.8,0.01,0.01,1,100,20,"[0.8095238095238095, 0.8095238095238095, 0.804...",0.831898,0.024364,"[0.8100084817642069, 0.8011026293469041, 0.799...",0.797955,0.016773,"[11, 20, 16, 32, 16, 14, 13, 16, 23, 12, 17, 2...",4697.381454
577,SAGEConv,B,96,0.8,0.01,0.01,1,100,20,"[0.780952380952381, 0.8, 0.8095238095238095, 0...",0.828248,0.023407,"[0.7769296013570822, 0.7955894826123834, 0.799...",0.79178,0.016867,"[12, 11, 23, 19, 22, 16, 11, 21, 23, 11, 18, 1...",4911.934584
578,SAGEConv,T,96,0.8,0.01,0.01,1,100,20,"[0.8095238095238095, 0.780952380952381, 0.8, 0...",0.830402,0.023394,"[0.7968617472434266, 0.7748091603053435, 0.802...",0.794813,0.016802,"[11, 12, 18, 19, 15, 16, 12, 18, 22, 12, 18, 1...",5067.023544


In [31]:
# Best GAT configuration per architecture by validation average.
df_gat.pipe(model_selection_gnn)

Unnamed: 0,conv,arch,ch,dropout,lr,wd,heads,splits,inits,val_accs,val_avg,val_std,test_accs,test_avg,test_std,stopped,elapsed
555,GATConv,M,96,0.4,0.01,0.01,4,100,20,"[0.8142857142857143, 0.819047619047619, 0.7809...",0.825867,0.027717,"[0.806615776081425, 0.8074639525021204, 0.8002...",0.789367,0.022377,"[18, 22, 18, 22, 23, 19, 36, 22, 25, 21, 19, 2...",2764.621085
592,GATConv,B,96,0.6,0.01,0.01,4,100,20,"[0.8142857142857143, 0.8, 0.7857142857142857, ...",0.822581,0.025466,"[0.8036471586089907, 0.8032230703986429, 0.716...",0.784573,0.021213,"[20, 24, 27, 16, 19, 21, 28, 22, 23, 17, 21, 1...",4486.007846
593,GATConv,T,96,0.6,0.01,0.01,4,100,20,"[0.7952380952380952, 0.8095238095238095, 0.809...",0.825079,0.025728,"[0.7964376590330788, 0.8159457167090755, 0.801...",0.78819,0.021055,"[16, 18, 18, 22, 24, 19, 19, 21, 19, 15, 19, 2...",4740.075585


In [None]:
# (scratch cell) Stray expression `a` removed: no such name is defined in this
# notebook, so it raised NameError on Restart & Run All.