In [1]:
import networkx as nx
import grakel
import pickle

import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
import gklearn.kernels as kernels
from sklearn import svm
from grakel.utils import graph_from_networkx
from grakel.kernels import WeisfeilerLehman, VertexHistogram

from scipy import stats


In [2]:
pickle_name = 'nx_domain_dataset.pickle'
with open(pickle_name, 'rb') as f:
    nx_train_set, nx_test_set = pickle.load(f)

manual_normalization = False
undirected_graphs = False
domains=False

def add_domains(dataset):
    for j in range(len(dataset)):
        g = dataset[j][0]
        
        node_domains = dict()
        for i in range(len(g.nodes())):
            if g.nodes[i]['domain'] not in node_domains:
                node_domains[g.nodes[i]['domain']] = [i]
            else:
                node_domains[g.nodes[i]['domain']].append(i)
            
        for domain in node_domains:
            if len(node_domains[domain]) >1:
                for d1 in node_domains[domain]:
                    for d2 in node_domains[domain]:
                        if d1 == d2:
                            continue
                        g.add_edge(d1, d2)
    return dataset
if domains:
    nx_train_set = add_domains(nx_train_set)
    nx_test_set = add_domains(nx_test_set)

In [3]:

if undirected_graphs:
    train_set = [g.to_undirected() for g,_ in nx_train_set]
    test_set = [g.to_undirected() for g,_ in nx_test_set]
else:
    train_set = [g for g,_ in nx_train_set]
    test_set = [g for g,_ in nx_test_set]

    
train_labels = np.asarray([l for _,l in nx_train_set])
test_labels = np.asarray([l for _,l in nx_test_set])


In [4]:
def modify_graphs(dataset):
    for g in dataset:
        for e in g.edges:
            print(e)
            g.add_edge(e[1], e[0])
    return dataset

In [5]:
splits=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)


if undirected_graphs:
    train_graphs = graph_from_networkx([g.to_undirected() for g,_ in nx_train_set], 'level')
    test_graphs = graph_from_networkx([g.to_undirected() for g,_ in nx_test_set], 'level')
else:
    train_graphs = graph_from_networkx([g for g,_ in nx_train_set], 'level')
    test_graphs = graph_from_networkx([g for g,_ in nx_test_set], 'level')
    

In [6]:
k=5
def knn_classification(matrix, train_labels, test_labels):
    args = np.argsort(matrix, 1)[:,k:]
    prediction = stats.mode(train_labels[args], 1)[0].reshape(-1)
    return (prediction == test_labels).mean()

In [7]:
params = [
#           ({'name':'shortest_path', 'with_labels':False}, False),
#           ({'name':'shortest_path', 'with_labels':True}, False),
#           ({'name':'graphlet_sampling'}, False),
          ({'name':'random_walk', 'with_labels':False}, False),
          ({'name':'random_walk', 'with_labels':True}, False),
          ({'name':'weisfeiler_lehman_optimal_assignment'},  False),
          ({'name':'weisfeiler_lehman_optimal_assignment'},  True),
    
    
          ({'name':'weisfeiler_lehman'},  False),
          ({'name':'weisfeiler_lehman'},  True),
          
          ({'name':'hadamard_code'},  False),
          ({'name':'hadamard_code'},  True),
         ]
results = np.zeros((2, 5, len(params)))
knn_results = np.zeros((5, len(params)))
for i, (p, normalize) in enumerate(params):
    kernel = grakel.GraphKernel(p)
    print(kernel.kernel, " kernel noramlize ", normalize)
    
    for j, (trn, val) in enumerate(splits.split(train_set, train_labels)):
        trn_set = graph_from_networkx([train_set[i] for i in trn], 'level')
        trn_labels = np.asarray([train_labels[i] for i in trn])
        val_set = graph_from_networkx([train_set[i] for i in val], 'level')
        val_labels = np.asarray([train_labels[i] for i in val])
        kernel = grakel.GraphKernel(p, normalize=normalize)
        
        kernel_train = kernel.fit_transform(trn_set)
                                            
        kernel_val = kernel.transform(val_set)
        
        # attempts to downscale kernel values
        if manual_normalization:
#             kernel_train -= kernel_train.min()
#             kernel_train /= kernel_train.max()+1e-3


#             kernel_val -= kernel_val.min()
#             kernel_val /= kernel_val.max()+1e-3
            
            kernel_train = 1/(1+np.exp(-kernel_train))
            kernel_val = 1/(1+np.exp(-kernel_val))
        
        svc = svm.SVC(kernel='precomputed')
        svc.fit(kernel_train, trn_labels)
        new_labels = svc.predict(kernel_train)

        train_accuracy = (new_labels == trn_labels).mean()

        new_labels = svc.predict(kernel_val)

        val_accuracy = (new_labels == val_labels).mean()
        results[0, j, i] = train_accuracy
        results[1, j, i] = val_accuracy
        knn_results[j, i] = knn_classification(kernel_val, trn_labels, val_labels)
#         print("knn accuracy {:.2f}%".format(knn_results[j,i]))
        print("Training accuracy: {:.2f}%, validation acc {:.2f}%".format(100*train_accuracy, 100*val_accuracy))
    

{'name': 'random_walk', 'with_labels': False}  kernel noramlize  False
Training accuracy: 38.46%, validation acc 47.50%
Training accuracy: 61.15%, validation acc 66.67%
Training accuracy: 61.15%, validation acc 61.54%
Training accuracy: 63.69%, validation acc 69.23%
Training accuracy: 39.49%, validation acc 43.59%
{'name': 'random_walk', 'with_labels': True}  kernel noramlize  False
Training accuracy: 66.03%, validation acc 57.50%
Training accuracy: 63.69%, validation acc 66.67%
Training accuracy: 59.87%, validation acc 61.54%
Training accuracy: 65.61%, validation acc 74.36%
Training accuracy: 64.97%, validation acc 69.23%
{'name': 'weisfeiler_lehman_optimal_assignment'}  kernel noramlize  False
Training accuracy: 86.54%, validation acc 65.00%
Training accuracy: 85.99%, validation acc 71.79%
Training accuracy: 85.35%, validation acc 74.36%
Training accuracy: 85.35%, validation acc 61.54%
Training accuracy: 85.99%, validation acc 74.36%
{'name': 'weisfeiler_lehman_optimal_assignment'}  

In [8]:
best_params_id = results[1].mean(0).argmax()
print(f'best model is: {params[best_params_id]} with mean validation acc {100*results[1].mean(0).max()} %')
best_model, normalize = params[best_params_id]

best model is: ({'name': 'weisfeiler_lehman_optimal_assignment'}, False) with mean validation acc 69.41025641025641 %


In [9]:
new_params = [
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 0.1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 0.5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 0.5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 0.5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 0.5),
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 1), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 2), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 5), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':1}, True, 10), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':2}, True, 10), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':3}, True, 10), 
    ({'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter':5}, True, 10),
]

In [None]:
results = np.zeros((2, 5, len(new_params)))
for i, (p, normalize, c) in enumerate(new_params):
    print(kernel.kernel, " kernel noramlize ", normalize, "C: ", c)
    
    for j, (trn, val) in enumerate(splits.split(train_set, train_labels)):
        trn_set = graph_from_networkx([train_set[i] for i in trn], 'level')
        trn_labels = np.asarray([train_labels[i] for i in trn])
        val_set = graph_from_networkx([train_set[i] for i in val], 'level')
        val_labels = np.asarray([train_labels[i] for i in val])
        kernel = grakel.GraphKernel(p, normalize=normalize)
        
        kernel_train = kernel.fit_transform(trn_set)

        kernel_val = kernel.transform(val_set)
        
        # attempts to downscale kernel values
        if manual_normalization:
#             kernel_train -= kernel_train.min()
#             kernel_train /= kernel_train.max()+1e-3


#             kernel_val -= kernel_val.min()
#             kernel_val /= kernel_val.max()+1e-3
            
            kernel_train = 1/(1+np.exp(-kernel_train))
            kernel_val = 1/(1+np.exp(-kernel_val))
        
        svc = svm.SVC(kernel='precomputed', C=c)
        svc.fit(kernel_train, trn_labels)
        new_labels = svc.predict(kernel_train)

        train_accuracy = (new_labels == trn_labels).mean()

        new_labels = svc.predict(kernel_val)

        val_accuracy = (new_labels == val_labels).mean()
        results[0, j, i] = train_accuracy
        results[1, j, i] = val_accuracy
#         print("knn accuracy {:.2f}%".format(knn_results[j,i]))
        print("Training accuracy: {:.2f}%, validation acc {:.2f}%".format(100*train_accuracy, 100*val_accuracy))
    

{'name': 'hadamard_code'}  kernel noramlize  True C:  0.1
Training accuracy: 68.59%, validation acc 62.50%
Training accuracy: 67.52%, validation acc 66.67%
Training accuracy: 64.33%, validation acc 69.23%
Training accuracy: 66.88%, validation acc 69.23%
Training accuracy: 65.61%, validation acc 64.10%
{'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter': 1}  kernel noramlize  True C:  0.1
Training accuracy: 68.59%, validation acc 62.50%
Training accuracy: 67.52%, validation acc 66.67%
Training accuracy: 66.24%, validation acc 71.79%
Training accuracy: 66.88%, validation acc 69.23%
Training accuracy: 67.52%, validation acc 66.67%
{'name': 'weisfeiler_lehman_optimal_assignment', 'n_iter': 2}  kernel noramlize  True C:  0.1
Training accuracy: 68.59%, validation acc 62.50%
Training accuracy: 67.52%, validation acc 66.67%
Training accuracy: 66.24%, validation acc 71.79%
Training accuracy: 66.88%, validation acc 69.23%
Training accuracy: 67.52%, validation acc 66.67%
{'name': 'weisfeile

In [None]:
best_params_id = results[1].mean(0).argmax()
best_mean_val = results[1].mean(0).max()
best_params = new_params[best_params_id]
print('best parameters are {}, C {} which achieved {:.2f}% mean val acc'.format(best_params[0], best_params[2], best_mean_val))

In [None]:
trn_set = graph_from_networkx([g for g in train_set], 'level')
tst_set = graph_from_networkx([g for g in test_set], 'level')
kernel = grakel.GraphKernel(best_params[0], normalize=best_params[1])
kernel_train = kernel.fit_transform(trn_set)
kernel_test = kernel.transform(tst_set)

if manual_normalization:
    kernel_train = 1/(1+np.exp(-kernel_train))
    kernel_test = 1/(1+np.exp(-kernel_val))
    
svc = svm.SVC(kernel='precomputed', C=best_params[2])
svc.fit(kernel_train, train_labels)
prediction = svc.predict(kernel_test)
test_accuracy = (prediction == test_labels).mean()
print('best model test acc is {:.2f}%'.format(100*test_accuracy))

In [None]:
# just for fun we cna evaluate every kernel on the test set

for i, (p, normalize) in enumerate(params):
    trn_set = graph_from_networkx([g for g in train_set], 'level')
    tst_set = graph_from_networkx([g for g in test_set], 'level')


    kernel = grakel.GraphKernel(p)
    print(kernel.kernel, " kernel noramlize ", normalize,)
    kernel = grakel.GraphKernel(p, normalize=normalize)
        
    kernel_train = kernel.fit_transform(trn_set)

    kernel_test = kernel.transform(tst_set)
        
        # attempts to downscale kernel values
    if manual_normalization:
#             kernel_train -= kernel_train.min()
#             kernel_train /= kernel_train.max()+1e-3
#             kernel_val -= kernel_val.min()
#             kernel_val /= kernel_val.max()+1e-3

        kernel_train = 1/(1+np.exp(-kernel_train))
        kernel_test = 1/(1+np.exp(-kernel_val))

    svc = svm.SVC(kernel='precomputed')
    svc.fit(kernel_train, train_labels)
    new_labels = svc.predict(kernel_train)

    train_accuracy = (new_labels == train_labels).mean()

    new_labels = svc.predict(kernel_test)

    test_accuracy = (new_labels == test_labels).mean()

    print("Training accuracy: {:.2f}%, Test acc {:.2f}%".format(100*train_accuracy, 100*test_accuracy))
    