In [1]:
import numpy as np

from os import listdir
from os.path import isfile, join
import networkx as nx
import networkx.algorithms as alg

import dgl
from dgl.nn import RelGraphConv

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np


from dgl.nn import GraphConv, AvgPooling
from dgl.nn.pytorch import Sequential

from dgl.dataloading import GraphDataLoader
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split,SubsetRandomSampler, ConcatDataset
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold, StratifiedKFold

import matplotlib.pyplot as plt

from torch.optim import lr_scheduler

import pickle

from copy import deepcopy

In [2]:
# Load the pre-built dataset. The pickle is unpacked into exactly two objects:
# the training set and the test set (presumably sequences of (graph, label)
# pairs -- that is how later cells index them; TODO confirm against the producer).
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a
# trusted source.
pickle_name = 'nx_url_dataset.pickle'
with open(pickle_name, 'rb') as f:
    nx_train_set, nx_test_set = pickle.load(f)

In [3]:
# Integer id of every edge relation; the ids are written to the 'type'
# attribute of graph edges.
e2id = dict(
    title=0,
    link=1,
    reversed_title=2,
    reversed_link=3,
    domain=4,
)

def add_encoding(dataset):
    """Attach node features and typed edges to every graph in ``dataset``, in place.

    For every item, ``item[0]`` is taken as a directed networkx graph whose nodes
    are labelled ``0 .. n-1`` (nodes are addressed by ``range(len(g.nodes))``):

    1. Five per-node statistics are computed on the graph as loaded, before any
       edge is added: in-degree centrality, out-degree centrality, closeness
       centrality, eigenvector centrality of the reversed graph, and clustering.
    2. Every original edge whose ``'link_type'`` is ``'title'`` or ``'link'`` gets
       an integer ``'type'`` from ``e2id`` and a reversed twin edge typed
       ``'reversed_title'`` / ``'reversed_link'``. Edges with any other
       ``'link_type'`` are left without a ``'type'``.
    3. Node ``i`` gets attribute ``'h'``: its five statistics standardised with
       the per-graph mean and standard deviation (``1e-6`` added to the std).
    4. Every ordered pair of distinct nodes sharing the same ``'domain'`` node
       attribute is connected with an edge typed ``'domain'``. If an edge already
       exists between the pair, its ``'type'`` is overwritten.

    Parameters
    ----------
    dataset : indexable collection
        Items are indexed as ``dataset[j][0]`` to obtain the graph.

    Returns
    -------
    The same ``dataset`` object; its graphs have been modified in place.

    Raises
    ------
    KeyError
        If an original edge has no ``'link_type'`` or a node has no ``'domain'``.
    """
    for j in range(len(dataset)):
        g = dataset[j][0]

        # Structural statistics must describe the original topology, so they are
        # computed before the reversed/domain edges are inserted below.
        in_degree = nx.algorithms.in_degree_centrality(g)
        out_degree = nx.algorithms.out_degree_centrality(g)
        closeness = nx.algorithms.closeness_centrality(g)
        clustering = nx.algorithms.clustering(g)
        reversed_graph = g.reverse(copy=True)
        eig = nx.algorithms.eigenvector_centrality(reversed_graph, max_iter=10000)

        # One row per statistic, one column per node; only used for mean/std.
        stats = np.asarray([list(in_degree.values()),
                            list(out_degree.values()),
                            list(closeness.values()),
                            list(eig.values()),
                            list(clustering.values())
                           ])
        mean = stats.mean(1)
        std = stats.std(1)

        # Iterate over a snapshot: the loop adds reversed edges, and walking the
        # live edge view could visit those new edges, which carry no 'link_type'.
        for e in list(g.edges):
            u, v = e[0], e[1]
            link_type = g.edges[u, v]['link_type']
            if link_type == 'title':
                forward, backward = e2id['title'], e2id['reversed_title']
            elif link_type == 'link':
                forward, backward = e2id['link'], e2id['reversed_link']
            else:
                continue
            g.edges[u, v]['type'] = forward
            g.add_edge(v, u)
            g.edges[v, u]['type'] = backward

        node_domains = dict()
        for i in range(len(g.nodes)):
            feats = np.asarray([in_degree[i], out_degree[i], closeness[i], eig[i], clustering[i]])
            feats -= mean
            feats /= (std + 1e-6)  # epsilon guards against a zero std
            g.nodes[i]['h'] = torch.tensor(feats)
            node_domains.setdefault(g.nodes[i]['domain'], []).append(i)

        # Fully connect (both directions) the nodes that share a domain.
        for members in node_domains.values():
            for d1 in members:
                for d2 in members:
                    if d1 == d2:
                        continue
                    g.add_edge(d1, d2)
                    g.edges[d1, d2]['type'] = e2id['domain']

    return dataset
# add_encoding modifies the graphs in place and returns the object it was given,
# so train_set / test_set refer to the same objects as nx_train_set / nx_test_set.
# NOTE(review): this cell is not idempotent -- running it a second time would
# encode the already-augmented graphs; reload the pickle before re-running.
train_set = add_encoding(nx_train_set)
test_set = add_encoding(nx_test_set)

In [4]:
# Convert every networkx graph to a DGL graph (keeping node attribute 'h' and
# edge attribute 'type') and pair it with its label as a 1-element float tensor.
train_graphs = [
    (
        dgl.from_networkx(nx_graph, node_attrs=["h"], edge_attrs=['type']),
        torch.tensor([label]).float(),
    )
    for nx_graph, label in train_set
]
# Plain labels, in dataset order.
train_labels = [label for _, label in train_set]

test_graphs = [
    (
        dgl.from_networkx(nx_graph, node_attrs=['h'], edge_attrs=['type']),
        torch.tensor([label]).float(),
    )
    for nx_graph, label in test_set
]
test_labels = [label for _, label in test_set]


In [5]:
# Merge all training graphs into one batched DGL graph, bound to `g`.
g = dgl.batch([graph for graph, _ in train_graphs])


In [6]:
class RGCN(nn.Module):
    """Graph-level binary classifier built from two relational graph convolutions.

    Pipeline: RelGraphConv + ReLU -> dropout -> RelGraphConv + ReLU -> dropout ->
    average pooling over each graph's nodes -> linear layer to one unit -> sigmoid.

    Parameters
    ----------
    in_features : int
        Size of the input node feature vectors.
    hidden_features : int
        Output size of both graph convolution layers.
    dropout : float
        Dropout probability applied after each convolution.
    num_edge_types : int
        Number of relations passed to both ``RelGraphConv`` layers.
    """

    def __init__(self, in_features, hidden_features, dropout, num_edge_types):
        super().__init__()
        # First convolution block.
        self.in_layer = RelGraphConv(
            in_features, hidden_features, num_edge_types, activation=nn.ReLU()
        )
        self.dropout1 = nn.Dropout(dropout)
        # Second convolution block.
        self.out_layer = RelGraphConv(
            hidden_features, hidden_features, num_edge_types, activation=nn.ReLU()
        )
        self.dropout2 = nn.Dropout(dropout)
        # Graph readout and classification head.
        self.global_mean_pool = dgl.nn.pytorch.glob.AvgPooling()
        self.fc = nn.Linear(hidden_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, g, in_feat, e_types):
        """Return the sigmoid output of the head for the graph(s) in ``g``.

        ``g`` is the (batched) graph, ``in_feat`` the node features and
        ``e_types`` the per-edge relation ids forwarded to both convolutions.
        """
        blocks = (
            (self.in_layer, self.dropout1),
            (self.out_layer, self.dropout2),
        )
        h = in_feat
        for conv, drop in blocks:
            h = drop(conv(g, h, e_types))
        pooled = self.global_mean_pool(g, h)
        return self.sigmoid(self.fc(pooled))

In [7]:
def evaluate(model, dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` gets right.

    The model output is rounded to 0/1 and compared element-wise with the labels,
    so labels are expected to have the same shape as the model output
    (presumably ``(batch, 1)`` -- TODO confirm against the loader's collation).

    Parameters
    ----------
    model : callable module
        Called as ``model(graph, graph.ndata['h'].float(), graph.edata['type'])``.
    dataloader : iterable of ``(batched_graph, labels)``

    Returns
    -------
    float
        ``num_correct / num_samples``.

    Raises
    ------
    ZeroDivisionError
        If ``dataloader`` yields no samples.

    Notes
    -----
    Switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    num_correct = 0
    num_tests = 0
    # Inference only: no autograd graph is needed, which saves memory and time.
    with torch.no_grad():
        for batched_graph, labels in dataloader:
            probs = model(batched_graph,
                          batched_graph.ndata['h'].float(),
                          batched_graph.edata['type'])
            pred = torch.round(probs)
            num_correct += (pred == labels).sum().float().item()
            num_tests += len(labels)
    return num_correct / num_tests

def train(model, train_loader, optimizer):
    """Run one optimisation epoch over ``train_loader`` and return the mean loss.

    Every batch yielded by ``train_loader`` is used for one optimizer step.
    (Previously the function returned from inside the loop, so only the first
    batch of each epoch was ever trained on.)

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(graph, graph.ndata['h'].float(), graph.edata['type'])``;
        its output is passed to ``nn.BCELoss`` and so must be a probability.
    train_loader : iterable of ``(batched_graph, target)``
    optimizer : torch.optim.Optimizer
        Optimizer over ``model``'s parameters.

    Returns
    -------
    float
        Per-sample mean BCE loss over the epoch, each batch's loss being measured
        before that batch's update. With a single batch this equals that batch's
        loss. ``nan`` if the loader yields nothing.

    Notes
    -----
    Switches ``model`` to train mode and leaves it there.
    """
    criterion = nn.BCELoss()

    model.train()
    total_loss = 0.0
    n_graphs = 0
    for data, target in train_loader:
        output = model(data, data.ndata['h'].float(), data.edata['type'])
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # BCELoss averages within the batch; weight by batch size so the epoch
        # value is a true per-sample mean even when the last batch is smaller.
        total_loss += loss.item() * len(target)
        n_graphs += len(target)

    if n_graphs == 0:
        return float('nan')
    return total_loss / n_graphs

def val_loss(model, loader):
    """Return the mean BCE loss of ``model`` over ``loader`` without training.

    Fixes relative to the previous version: batches are read from the ``loader``
    argument (not from a global ``train_loader``), every batch contributes to
    the result (not only the first), and batches are moved to the device the
    model's parameters live on instead of a global ``device``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(graph, graph.ndata['h'].float(), graph.edata['type'])``;
        its output is passed to ``nn.BCELoss`` and so must be a probability.
    loader : iterable of ``(batched_graph, target)``
        Both elements must support ``.to(device)``.

    Returns
    -------
    float
        Per-sample mean BCE loss; ``nan`` if the loader yields nothing.

    Notes
    -----
    Switches ``model`` to eval mode and leaves it there.
    """
    criterion = nn.BCELoss()
    model.eval()

    # Follow the model's own placement; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    n_graphs = 0
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data, data.ndata['h'].float(), data.edata['type'])
            # BCELoss is a batch mean; weight it by batch size for a true overall mean.
            total_loss += criterion(output, target).item() * len(target)
            n_graphs += len(target)

    if n_graphs == 0:
        return float('nan')
    return total_loss / n_graphs
        


In [8]:
# model.in_layer.linear_r

In [9]:
# 5-fold stratified cross-validation with a fixed seed so the folds are reproducible.
splits=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
# NOTE(review): split_idxs is a one-shot generator and is not referenced anywhere
# else in the visible notebook; the training loop calls splits.split(...) itself.
split_idxs = splits.split(np.arange(len(train_graphs)), train_labels)

# Number of epochs used for every training run (also the x-range of the plots).
num_epochs=100
# NOTE(review): mid-notebook import; consider moving it to the import cell.
from itertools import product
# Hyper-parameter grid. NOTE(review): these three names are later rebound to
# scalars by the grid-search loop, so model_params is the only reliable copy.
hidden_size = [4, 16, 32]
dropout = [0., 0.5, 0.7]

lr = [0.05, 0.01, 0.005]

# All training graphs batched into one graph; used below to read the edge types.
tg = dgl.batch([g for g,_ in train_graphs])

# Every (hidden_size, dropout, lr) combination: 3 * 3 * 3 = 27 configurations.
model_params = list(product(hidden_size, dropout, lr))

# Node feature width, read from the first training graph.
in_feature_size = train_graphs[0][0].ndata['h'].shape[1]
# Relation count = largest edge type id found in the training graphs, plus one.
num_edge_types = tg.edata['type'].max().item() +1
print(num_edge_types)

# model_params = [[4, 0., 0.1]]
# print(model_params)

5


In [10]:
# for l in splits.split(np.arange(len(train_graphs)), train_labels):
#     print(train_labels[l])

In [None]:
# Grid search: for every hyper-parameter combination, train a fresh model on each
# of the stratified folds and record per-epoch loss/accuracy histories.
# `results[i]` is a dict {'fold1': history, ...} for `model_params[i]`.
# NOTE(review): best_test_acc is assigned but not used in the visible notebook.
best_test_acc = 0
# NOTE(review): `device` is defined here but the model is never moved to it in
# this cell; confirm that model and batches end up on the same device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
results = []
# The loop variables rebind the module-level names hidden_size / dropout / lr
# (previously the grid lists) to the scalars of the current combination.
for i, (hidden_size, dropout, lr) in enumerate(model_params):
    foldperf = {}
    print(f"model {i}/{len(model_params)}")
    print('Model parameters: hidden_size {}, dropout {}, optimizer lr {}'.format(*model_params[i]))
    
    for fold, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(train_graphs)), train_labels)):
        # A new, freshly initialised model for every fold.
        model = RGCN(in_feature_size, hidden_size, dropout, num_edge_types)

        # NOTE(review): lrs is never filled or read in the visible notebook.
        lrs = []


        # Both loaders draw from train_graphs; the samplers restrict them to the
        # fold's training / validation indices. The "test_*" names in this cell
        # therefore refer to the validation fold, not to the held-out test set.
        train_sampler = SubsetRandomSampler(train_idx)
        test_sampler = SubsetRandomSampler(val_idx)
        train_loader = GraphDataLoader(train_graphs, batch_size=1000, sampler=train_sampler)
        test_loader = GraphDataLoader(train_graphs, batch_size=1000, sampler=test_sampler)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        history = {'train_loss': [], 'test_loss': [],'train_acc':[],'test_acc':[]}

        for epoch in range(num_epochs):
            # One training pass, then metrics on the training and validation folds.
            train_loss = train(model,train_loader, optimizer)
            train_acc = evaluate(model, train_loader)
            test_loss = val_loss(model,test_loader)
            test_acc = evaluate(model, test_loader)
#             if epoch%10==1:
#                 print("{} Loss train/val:{:.3f} / {:.3f} Acc train/val {:.2f} / {:.2f} %".format(epoch, train_loss,
#                                                                                               test_loss,
#                                                                                               100*train_acc,
#                                                                                               100*test_acc))

            history['train_loss'].append(train_loss)
            history['test_loss'].append(test_loss)
            history['train_acc'].append(train_acc)
            history['test_acc'].append(test_acc)
        foldperf[f'fold{fold+1}'] = history  
    results.append(foldperf)
    # NOTE(review): train_loader / test_loader from the last fold remain bound as
    # globals after this cell finishes and are visible to later cells.
#     break


model 0/27
Model parameters: hidden_size 4, dropout 0.0, optimizer lr 0.05




model 1/27
Model parameters: hidden_size 4, dropout 0.0, optimizer lr 0.01




model 2/27
Model parameters: hidden_size 4, dropout 0.0, optimizer lr 0.005




model 3/27
Model parameters: hidden_size 4, dropout 0.5, optimizer lr 0.05




model 4/27
Model parameters: hidden_size 4, dropout 0.5, optimizer lr 0.01




model 5/27
Model parameters: hidden_size 4, dropout 0.5, optimizer lr 0.005




model 6/27
Model parameters: hidden_size 4, dropout 0.7, optimizer lr 0.05




model 7/27
Model parameters: hidden_size 4, dropout 0.7, optimizer lr 0.01




model 8/27
Model parameters: hidden_size 4, dropout 0.7, optimizer lr 0.005




model 9/27
Model parameters: hidden_size 16, dropout 0.0, optimizer lr 0.05




model 10/27
Model parameters: hidden_size 16, dropout 0.0, optimizer lr 0.01




model 11/27
Model parameters: hidden_size 16, dropout 0.0, optimizer lr 0.005




model 12/27
Model parameters: hidden_size 16, dropout 0.5, optimizer lr 0.05




model 13/27
Model parameters: hidden_size 16, dropout 0.5, optimizer lr 0.01




model 14/27
Model parameters: hidden_size 16, dropout 0.5, optimizer lr 0.005




model 15/27
Model parameters: hidden_size 16, dropout 0.7, optimizer lr 0.05




model 16/27
Model parameters: hidden_size 16, dropout 0.7, optimizer lr 0.01




model 17/27
Model parameters: hidden_size 16, dropout 0.7, optimizer lr 0.005




model 18/27
Model parameters: hidden_size 32, dropout 0.0, optimizer lr 0.05




model 19/27
Model parameters: hidden_size 32, dropout 0.0, optimizer lr 0.01




In [None]:
# One 2x2 figure per hyper-parameter combination:
#   top-left     mean loss with a min/max band across folds (train and validation)
#   top-right    mean accuracy with a min/max band across folds (train and validation)
#   bottom-left  training loss of every fold
#   bottom-right validation loss of every fold
for i, foldperf in enumerate(results):
    # Each array has one row per fold and one column per epoch.
    train_loss = np.vstack([foldperf[fold]['train_loss'] for fold in foldperf])
    test_loss = np.vstack([foldperf[fold]['test_loss'] for fold in foldperf])
    train_acc = np.vstack([foldperf[fold]['train_acc'] for fold in foldperf])
    test_acc = np.vstack([foldperf[fold]['test_acc'] for fold in foldperf])
    fold_names = list(foldperf)
    epochs = range(num_epochs)

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(9, 9))
    fig.suptitle(f'Model {i}, hidden_size {model_params[i][0]}, dropout {model_params[i][1]}, lr {model_params[i][2]}')

    # Loss across folds.
    ax1.title.set_text('Loss with cross validation')
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Loss")
    ax1.fill_between(epochs, train_loss.min(0), train_loss.max(0), alpha=0.3, label="train loss <min; max>")
    ax1.fill_between(epochs, test_loss.min(0), test_loss.max(0), alpha=0.3, label="test loss <min; max>")
    ax1.plot(epochs, train_loss.mean(0), label='train loss mean')
    ax1.plot(epochs, test_loss.mean(0), label='test loss mean')
    ax1.legend()

    # Accuracy across folds (the panel used to be titled as a loss plot and its
    # mean curves had no legend entries).
    ax2.title.set_text("Accuracy with cross validation")
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Accuracy")
    ax2.fill_between(epochs, train_acc.min(0), train_acc.max(0), alpha=0.2, label="train acc <min; max>")
    ax2.fill_between(epochs, test_acc.min(0), test_acc.max(0), alpha=0.2, label="test acc <min; max>")
    ax2.plot(epochs, train_acc.mean(0), label='train acc mean')
    ax2.plot(epochs, test_acc.mean(0), label='test acc mean')
    ax2.legend()

    # Per-fold training loss; each curve is labelled so the legend is not empty.
    ax3.title.set_text('Train loss per fold')
    for fold_name, fold_curve in zip(fold_names, train_loss):
        ax3.plot(fold_curve, label=fold_name)
    ax3.set_xlabel('Epoch')
    ax3.set_ylabel('Loss')
    ax3.legend()

    # Per-fold validation loss (the y-axis used to be mislabelled 'Accuracy').
    ax4.title.set_text('Validation loss per fold')
    for fold_name, fold_curve in zip(fold_names, test_loss):
        ax4.plot(fold_curve, label=fold_name)
    ax4.set_xlabel('Epoch')
    ax4.set_ylabel('Loss')
    ax4.legend()

    plt.show()


In [None]:
# Score each configuration by its final-epoch validation accuracy averaged over
# the folds, then keep the hyper-parameters of the best-scoring one.
accs = []
for i, foldperf in enumerate(results):
    mean_final_acc = np.vstack(
        [fold_history['test_acc'][-1] for fold_history in foldperf.values()]
    ).mean(0)
    accs.append(mean_final_acc)
best_params = model_params[np.asarray(accs).argmax()]



In [None]:
# Retrain a fresh model on the whole training set with the selected
# hyper-parameters, recording loss and training accuracy after every epoch.
hidden_size, dropout, lr = best_params
# RGCN takes (in_features, hidden_features, dropout, num_edge_types); the input
# feature size was missing here, which made the constructor call fail.
model = RGCN(in_feature_size, hidden_size, dropout, num_edge_types)

train_loader = GraphDataLoader(train_graphs, batch_size=1000)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
train_loss = []
train_acc = []

for epoch in range(num_epochs):
    loss = train(model, train_loader, optimizer)
    acc = evaluate(model, train_loader)

    train_loss.append(loss)
    train_acc.append(acc)


In [None]:
# Loss and training accuracy of the final model, epoch by epoch.
# plt.subplots already creates the figure; a preceding plt.figure() only
# produced an extra empty figure.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Training loss and acc for the best model')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.plot(train_loss)

ax2.set_xlabel('epoch')
ax2.set_ylabel('Acc [%]')
# Accuracies are stored as fractions; scale them to match the percent axis label.
ax2.plot([100 * a for a in train_acc])

plt.show()

In [None]:
# Build the loader from the held-out test graphs. Without this line `test_loader`
# is whatever an earlier cell left behind (the last cross-validation fold's
# validation loader over training graphs), so the number printed below would
# not be a test accuracy.
test_loader = GraphDataLoader(test_graphs, batch_size=1000)

print('best params: hidden size {}, dropout {}, Adam learning rate {}'.format(*best_params))
acc = evaluate(model, test_loader)
print(f'Model test acc is {acc*100}%')