In [1]:
import networkx as nx
import grakel
import pickle

import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
import gklearn.kernels as kernels
from sklearn import svm
from grakel.utils import graph_from_networkx
from grakel.kernels import WeisfeilerLehman, VertexHistogram

from scipy import stats
from utils import *

In [2]:
pickle_name = 'nx_dataset.pickle'
with open(pickle_name, 'rb') as f:
    nx_train_set, nx_val_set, nx_test_set = pickle.load(f)


nx_train_set = add_backward_edges(nx_train_set)
nx_val_set = add_backward_edges(nx_val_set)
nx_test_set = add_backward_edges(nx_test_set)

nx_train_set = add_domain(nx_train_set)
nx_val_set = add_domain(nx_val_set)
nx_test_set = add_domain(nx_test_set)

In [3]:
train_set = [g for g,_ in nx_train_set]
val_set = [g for g,_ in nx_val_set]
test_set = [g for g,_ in nx_test_set]

    
train_labels = np.asarray([l for _,l in nx_train_set])
val_labels = np.asarray([l for _,l in nx_val_set])
test_labels = np.asarray([l for _,l in nx_test_set])


In [4]:
def modify_graphs(dataset):
    for g in dataset:
        for e in g.edges:
            print(e)
            g.add_edge(e[1], e[0])
    return dataset

In [5]:
splits=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)


In [6]:
params = [
#           ({'name':'shortest_path', 'with_labels':True}, True),
          ({'name':'random_walk', 'with_labels':False}, True),
          ({'name':'random_walk', 'with_labels':True}, True),
          ({'name':'weisfeiler_lehman_optimal_assignment'},  True),
          ({'name':'weisfeiler_lehman'},  True),
          ({'name':'hadamard_code'},  True),
         ]
results = np.zeros((2, 5, len(params)))
knn_results = np.zeros((5, len(params)))
for i, (p, normalize) in enumerate(params):
    kernel = grakel.GraphKernel(p)
    print(kernel.kernel, " kernel noramlize ", normalize)
    
    for j, (trn, val) in enumerate(splits.split(train_set, train_labels)):
        trn_set = graph_from_networkx([train_set[i] for i in trn], 'level')
        trn_labels = np.asarray([train_labels[i] for i in trn])
        split_val_set = graph_from_networkx([train_set[i] for i in val], 'level')
        split_val_labels = np.asarray([train_labels[i] for i in val])
        kernel = grakel.GraphKernel(p, normalize=normalize)
        
        kernel_train = kernel.fit_transform(trn_set)
                                            
        kernel_val = kernel.transform(split_val_set)
        
        svc = svm.SVC(kernel='precomputed')
        svc.fit(kernel_train, trn_labels)
        new_labels = svc.predict(kernel_train)

        train_accuracy = (new_labels == trn_labels).mean()

        new_labels = svc.predict(kernel_val)

        val_accuracy = (new_labels == split_val_labels).mean()
        results[0, j, i] = train_accuracy
        results[1, j, i] = val_accuracy
        print("Training accuracy: {:.2f}%, validation acc {:.2f}%".format(100*train_accuracy, 100*val_accuracy))
    

{'name': 'random_walk', 'with_labels': False}  kernel noramlize  True


  return km / np.sqrt(np.outer(self._X_diag, self._X_diag))
  km /= np.sqrt(np.outer(Y_diag, X_diag))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
best_params_id = results[1].mean(0).argmax()
print(f'best model is: {params[best_params_id]} with mean validation acc {100*results[1].mean(0).max()} %')
best_model, normalize = params[best_params_id]

In [None]:
# weisfeiler_lehman_optimal_assignment is found by the CV, I did grid search manually 
new_params = [
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 0.5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 0.5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 0.5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 0.5),
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 10), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 10), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 10), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 10),
]

In [None]:
results = np.zeros((2, 5, len(new_params)))
for i, (p, normalize, c) in enumerate(new_params):
    print(kernel.kernel, " kernel noramlize ", normalize, "C: ", c)
    
    for j, (trn, val) in enumerate(splits.split(train_set, train_labels)):
        trn_set = graph_from_networkx([train_set[i] for i in trn], 'level')
        trn_labels = np.asarray([train_labels[i] for i in trn])
        val_set = graph_from_networkx([train_set[i] for i in val], 'level')
        val_labels = np.asarray([train_labels[i] for i in val])
        kernel = grakel.GraphKernel(p, normalize=normalize)
        
        kernel_train = kernel.fit_transform(trn_set)

        kernel_val = kernel.transform(val_set)
        
        svc = svm.SVC(kernel='precomputed', C=c)
        svc.fit(kernel_train, trn_labels)
        new_labels = svc.predict(kernel_train)

        train_accuracy = (new_labels == trn_labels).mean()

        new_labels = svc.predict(kernel_val)

        val_accuracy = (new_labels == val_labels).mean()
        results[0, j, i] = train_accuracy
        results[1, j, i] = val_accuracy
        print("Training accuracy: {:.2f}%, validation acc {:.2f}%".format(100*train_accuracy, 100*val_accuracy))
    

In [None]:
best_params_id = results[1].mean(0).argmax()
best_mean_val = results[1].mean(0).max()
best_params = new_params[best_params_id]
print('best parameters are {}, C {} which achieved {:.2f}% mean val acc'.format(best_params[0], best_params[2], best_mean_val))

In [None]:
trn_set = graph_from_networkx([g for g in train_set], 'level')
val_set = graph_from_networkx([g for g in val_set], 'level')

tst_set = graph_from_networkx([g for g in test_set], 'level')
kernel = grakel.GraphKernel(best_params[0], normalize=best_params[1])
kernel_train = kernel.fit_transform(trn_set)
kernel_test = kernel.transform(tst_set)

svc = svm.SVC(kernel='precomputed', C=best_params[2])
svc.fit(kernel_train, train_labels)
prediction = svc.predict(kernel_test)
test_accuracy = (prediction == test_labels).mean()
print('best model test acc is {:.2f}%'.format(100*test_accuracy))