In [1]:
import torch
from graphvite.application.network import LinkPredictor

def get_score(model, H, T):
    '''
    Return the raw (un-normalized) similarity score for each pair taken
    from `H` and `T`, using the embeddings held by `model`.

    The vertex and context embeddings of `model.solver` are wrapped in a
    `LinkPredictor` configured with the 'LINE' score function. The predictor
    and both index tensors are moved to the GPU before scoring, so the
    returned value lives on the CUDA device.
    '''
    solver = model.solver
    predictor = LinkPredictor(
        'LINE',
        solver.vertex_embeddings,
        solver.context_embeddings,
    ).cuda()
    # In this notebook the callers pass lists of node ids looked up through name2id.
    heads = torch.as_tensor(H).cuda()
    tails = torch.as_tensor(T).cuda()
    return predictor(heads, tails)

1.3.1


-----------------------------------------------
### Used for calculating normalized probability (Kaggle competition)

In [2]:
import pickle

# Settings that select which trained embedding to load. They are only used
# to build the input path here and the output path of the submission file.
dim = 64
length = 20
method = 'deepwalk'
directed = False
emb = f'embedding/dim{dim}-len{length}/{"directed_" if directed else ""}{method}_full.pkl'

# Open the file in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load embedding files
# that were produced locally / come from a trusted source.
with open(emb, 'rb') as emb_file:
    model = pickle.load(emb_file)

In [3]:
# Read the public test pairs and translate both endpoint names of every
# pair into the model's internal ids.
H = []
T = []
with open('data/test-public.txt', 'r') as fin:
    next(fin)  # the first line is skipped (treated as a header)
    mapping = model.graph.name2id
    for row in fin:
        # each remaining line must hold exactly three whitespace-separated
        # fields; the first one is ignored
        _, head, tail = row.split()
        H.append(mapping[head])
        T.append(mapping[tail])

In [4]:
score = get_score(model, H, T)
# Min-max rescale the raw scores into [0, 1] (in place). Note this is a
# monotone rescaling, not a calibrated probability.
smin = score.min(0, keepdim=True)[0]
smax = score.max(0, keepdim=True)[0]
span = smax - smin
# Guard against a zero range: if every score is identical the plain division
# would be 0 / 0 and fill the output with NaN. Dividing by 1 instead leaves
# the (already shifted) scores at 0.
span = torch.where(span > 0, span, torch.ones_like(span))
score -= smin
score /= span

In [5]:
# save the output
# Dump the scores as a two-column CSV: a 1-based row id and the score.
out = f'output/dim{dim}-len{length}_{"" if directed else "un"}directed_{method}.csv'
with open(out, 'w') as sink:
    sink.write('Id,Predicted\n')
    # one "<id>,<score>" line per test pair, ids starting at 1
    sink.writelines(f'{i+1},{s}\n' for i, s in enumerate(score.tolist()))
print('Write output to', out)

Write output to output/dim64-len20_undirected_deepwalk_regularized.csv


---
### Used for calculating AUC (evaluate different models)


In [2]:
import pickle
# Load the embedding used for the AUC evaluation. The `with` block closes the
# file handle as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('embedding/dim32-len40/deepwalk_train8.pkl', 'rb') as emb_file:
    model = pickle.load(emb_file)

In [9]:
# Read the labeled evaluation set: every line holds two endpoint names and
# an integer label. Names are translated to the model's internal ids.
H = []
T = []
Y = []
with open('data/test2.txt', 'r') as fin:
    mapping = model.graph.name2id
    for row in fin:
        head, tail, label = row.split()
        H.append(mapping[head])
        T.append(mapping[tail])
        Y.append(int(label))

# # filter testing data -- more robust approach
# with open('data/test2.txt', 'r') as f:
#     mapping = model.graph.name2id
#     omitted = 0
#     for line in f:
#         h, t, y = line.split()
#         if h in mapping and t in mapping:
#             H.append(mapping[h])
#             T.append(mapping[t])
#             Y.append(int(y))
#         else:
#             omitted += 1
            
#     # MAGIC_NUMBER: the number of unmatched edges, which occur because the testing data is randomly generated
#     assert omitted == 1765609

In [12]:
# AUC by pair counting: sort the labels by descending score, then for every
# negative (label 0) count how many positives (label 1) are ranked above it.
# Assumes the labels in Y are 0/1 -- TODO confirm against the data file.
L = torch.as_tensor(Y).cuda()
score = get_score(model, H, T)
order = torch.argsort(score, descending=True)
L = L[order]
# hit[i] = sum of the labels at positions 0..i of the ranking
hit = torch.cumsum(L, dim=0)
# Total number of (negative, positive) pairs. Deliberately NOT named `all`,
# which would shadow the builtin `all()` for the rest of the kernel session.
num_pairs = torch.sum(L == 0) * torch.sum(L == 1)
# Correctly ordered pairs / all pairs. Tied scores are not given half credit.
auc = torch.sum(hit[L == 0]).item() / num_pairs.item()
print(f'AUC: {auc}')

AUC: 0.9387663497194524
