In [1]:
import collections
import os
import pickle
import random
import numpy as np
import math
import torch

In [2]:
def get_sets(path):
    """
    Parses the raw FB text files and returns the set of all entities (their machine ids) and relations.

    Every non-blank line of train.txt, valid.txt and test.txt inside `path` must hold one
    tab-separated triple `lhs<TAB>rel<TAB>rhs`. Blank lines (for instance an extra newline at
    the end of a file) are skipped instead of crashing the 3-way unpack.

    !!!The entities (and relations) are sorted before assigning them to an index

    Input: path object (dataset directory)

    Output:
        ent_id   -- dict: entity string -> unique integer index
        rel_id   -- dict: relation string -> unique integer index
        edge_set -- dict: split file name -> set of (lhs, rel, rhs) tuples found in that split

    Raises: ValueError if a non-blank line does not contain exactly three tab-separated fields.
    """
    entities, relations = set(), set()
    edge_set = {}
    for split in ["train.txt", "valid.txt", "test.txt"]:
        edges = set()
        with open(os.path.join(path, split), "r") as lines:
            for line in lines:
                line = line.strip()
                if not line:
                    # Tolerate empty lines rather than failing on the unpack below.
                    continue
                lhs, rel, rhs = line.split("\t")
                entities.add(lhs)
                entities.add(rhs)
                relations.add(rel)
                edges.add((lhs, rel, rhs))
        edge_set[split] = edges

    # Sorting makes the string -> index assignment independent of set iteration order.
    ent_id = {k: i for (i, k) in enumerate(sorted(entities))}
    rel_id = {k: i for (i, k) in enumerate(sorted(relations))}

    return ent_id, rel_id, edge_set


In [3]:
def get_codebook(path, D_ent, D_rel):
    """
    Generates a codebook for the set of entities and relations by randomly sampling from the
    unit hyperspheres of dimension D_ent and D_rel respectively.

    Each code is a vector of i.i.d. standard-normal components divided by its Euclidean norm.
    Relation codes additionally get a constant 1. prepended, so they have length D_rel + 1.
    Sampling uses numpy's global random state (seed it with np.random.seed for repeatable codes).

    input: path object, entity embedding dimension, relation embedding dimension

    output: ent_id, rel_id (from get_sets), ent_code (dict: entity string -> unit vector of
            length D_ent), rel_code (dict: relation string -> vector of length D_rel + 1),
            edge_set (from get_sets)
    """
    ent_id, rel_id, edge_set = get_sets(path)

    def unit_vec(dim):
        # standard_normal draws the same distribution as a multivariate normal with identity
        # covariance, without building and factorising a dim x dim covariance matrix per sample.
        vec = np.random.standard_normal(dim)
        return vec / np.linalg.norm(vec)

    # Entities are drawn first, then relations, in the dictionaries' insertion order.
    ent_code = {k: unit_vec(D_ent) for k in ent_id}
    rel_code = {k: np.append(1., unit_vec(D_rel)) for k in rel_id}

    return ent_id, rel_id, ent_code, rel_code, edge_set

In [4]:
def get_embeddings(path, split, D_ent, D_rel, rel_tensor = False):
    """
    Build one neighborhood-subgraph embedding per entity from the edges of a single split.

    Every triple (lhs, rel, rhs) read from `split` contributes the outer product of its head
    and tail codes (and, when rel_tensor is True, of its relation code as well). That tensor
    is added to the embedding slot of both endpoints of the edge.

    input: path object, data split (file name), entity embedding dim, relation embedding dim,
           rel_tensor flag (whether to keep a relation axis)

    output: embeddings array of shape (N_ent, D_ent, D_ent), or (N_ent, D_ent, D_ent, D_rel+1)
            when rel_tensor is True, followed by the five values returned by get_codebook
    """
    ent_id, rel_id, ent_code, rel_code, edge_set = get_codebook(path, D_ent, D_rel)
    with_relation = (rel_tensor == True)

    shape = (len(ent_id), D_ent, D_ent)
    if with_relation:
        shape = shape + (D_rel+1,)
    embeddings = np.zeros(shape)

    with open(os.path.join(path, split), "r") as handle:
        for raw in handle:
            lhs, rel, rhs = raw.strip().split("\t")
            h_vec, t_vec, r_vec = ent_code[lhs], ent_code[rhs], rel_code[rel]
            if with_relation:
                contribution = np.einsum('i,j,k -> ijk', h_vec, t_vec, r_vec)
            else:
                contribution = np.einsum('i,j -> ij', h_vec, t_vec)
            # The edge belongs to the neighborhood of its head and of its tail.
            for endpoint in (lhs, rhs):
                embeddings[ent_id[endpoint]] += contribution

    return embeddings, ent_id, rel_id, ent_code, rel_code, edge_set


In [5]:
# Dataset directory; get_sets() opens train.txt / valid.txt / test.txt inside it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = '/home/shazoop/KG-Embeddings/datasets/FB15K-237'

In [6]:
# Index every entity/relation and collect the edge set of each split.
ent_id,rel_id, edge_set =get_sets(path)

In [7]:
def avg_ePn(edge_set):
    '''
    Estimates average edges per node by just dividing number of edges by number of nodes.

    Input is the 'edge_set' output from any of the "get_..." functions: a dict mapping a
    split name to a set of (lhs, rel, rhs) triples. Edges are summed over all splits, and
    the nodes are the distinct lhs/rhs values appearing in those edges (for the edge_set
    built by get_sets this is exactly the key set of ent_id).

    Returns the ratio rounded up to an int, or 0 when edge_set contains no edges.
    '''
    edge_total = 0
    nodes = set()
    for edges in edge_set.values():
        edge_total += len(edges)
        for lhs, _, rhs in edges:
            nodes.add(lhs)
            nodes.add(rhs)
    if not nodes:
        # No edges at all: avoid dividing by zero.
        return 0
    return math.ceil(edge_total / len(nodes))

In [8]:
# Neighborhood embeddings from the training split with 100-dim entity and relation codes.
# rel_tensor is left at its default (False), so `embeddings` has shape (N_ent, 100, 100).
# Note this re-samples the codebook and rebinds ent_id / rel_id / edge_set.
embeddings, ent_id, rel_id, ent_code, rel_code, edge_set = get_embeddings(path,'train.txt',100,100)

In [9]:
# Target device for all tensors below; the recorded outputs show this resolves to 'cuda:1'.
# NOTE(review): hard-coded device ordinal -- confirm it exists on the machine running this.
device = torch.device(1)

    Found GPU%d %s which is of cuda capability %d.%d.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is %d.%d.
    


In [None]:
def embed_to_device(ent_code,rel_code, ent_embeddings, rel_embeddings, device, embeddings=None):
    """
    Stack the codebooks into dense tensors and move them, together with the neighborhood
    embeddings, onto `device`.

    Input:
        ent_code, rel_code -- dicts of numpy code vectors; rows follow dict insertion order
        ent_embeddings, rel_embeddings -- ignored on input (kept for interface compatibility);
                                          both are rebuilt from the codebooks
        device     -- torch device to move the tensors to
        embeddings -- numpy array of neighborhood embeddings; when omitted, the notebook-level
                      variable `embeddings` is used, as before

    Output: 5-tuple (ent_embeddings, rel_embeddings, ent_embeddings, rel_embeddings,
            nbd_embeddings). The tuple length is unchanged; the last slot now carries the
            neighborhood tensor, which previously was computed and then dropped.
    """
    if embeddings is None:
        # Backward-compatible fallback to the module-level array.
        embeddings = globals()["embeddings"]
    ent_embeddings = torch.from_numpy(np.stack(list(ent_code.values()), 0)).to(device)
    rel_embeddings = torch.from_numpy(np.stack(list(rel_code.values()), 0)).to(device)
    nbd_embeddings = torch.from_numpy(embeddings).to(device)
    return ent_embeddings, rel_embeddings, ent_embeddings, rel_embeddings, nbd_embeddings

In [15]:
# Stack the code vectors row-wise (dict insertion order) and ship each tensor to the device.
ent_embeddings = torch.from_numpy(np.stack(list(ent_code.values()), 0)).to(device)
rel_embeddings = torch.from_numpy(np.stack(list(rel_code.values()), 0)).to(device)
# Per-entity neighborhood embeddings built by get_embeddings.
nbd_embeddings = torch.from_numpy(embeddings).to(device)

In [16]:
def tch_projmatrix(ent, ent_embeddings):
    '''
    Batch of rank-one matrices e h^T, one per row e of ent_embeddings, for a fixed vector h.

    Input: ent (1-D tensor h), ent_embeddings (2-D tensor whose rows are the vectors e)

    Output: 3-D tensor `out` with out[b, i, j] = ent_embeddings[b, i] * ent[j]
    '''
    outer_batch = torch.einsum('bi,j ->bij', ent_embeddings, ent)
    return outer_batch

In [29]:
def tch_score(edge,ent_embeddings, rel_embeddings, nbd_embeddings, ent_id, rel_id, k, rel_tensor= False):
    '''
    Score the proposed relation (h,r,t) by using similarity matching. Assume tail is the variable.
    We just use connectivity here and ignore relations when computing similarity.

    Input:
        edge           -- tuple (mid1, relation, mid2)
        ent_embeddings -- tensor of entity codes, one row per entity index
        rel_embeddings, rel_id -- accepted for interface compatibility; not used, because the
                                  relation is ignored by this score
        nbd_embeddings -- neighborhood embeddings, (N, D, D), or (N, D, D, R) with rel_tensor
        ent_id         -- dict: entity string -> row index
        k              -- number of top matches to retrieve; the best one is discarded
        rel_tensor     -- if true, only slice 0 of the last axis (edge connectivity) is used

    Output: 0-dim float32 tensor on the same device as the input tensors.

    Note: the similarity is divided by the squared Frobenius norm of the head's neighborhood,
    so a head with an all-zero neighborhood embedding yields non-finite values.
    '''
    # Row indices and code vectors of head and tail.
    head_ix, tail_ix = ent_id[edge[0]], ent_id[edge[2]]
    head_embed, tail_embed = ent_embeddings[head_ix], ent_embeddings[tail_ix]

    # Connectivity-only view of the neighborhood embeddings.
    if rel_tensor:
        nbd_embed_norel = nbd_embeddings[:,:,:,0]
    else:
        nbd_embed_norel = nbd_embeddings
    head_nbd = nbd_embed_norel[head_ix]

    # Generate the projection matrices for the given head
    proj = tch_projmatrix(head_embed, ent_embeddings)
    Frob_norm = torch.norm(head_nbd)**2

    # Compute the graph homomorphism coeff
    x = torch.einsum('nij,jk -> nik',proj,head_nbd) #left multiply by projection matrix
    x = torch.einsum('nij,nkj -> nik',x,proj) #right multiply by transpose
    res = torch.bmm(x.permute(0,2,1),nbd_embed_norel)
    coeff = (1/Frob_norm)*torch.einsum('nii -> n',res) #number of matching edges

    # Get the top k; ignore top match, since that's likely to be nbd_embedding of head itself
    (val,ix) = torch.topk(coeff,k)
    nbr_ix = ix[1:]

    # For every retained neighbour n: e_n^T A_n t, computed in one batched call on the tensors'
    # own device (no notebook-level `device`, no per-element host round trip). The
    # connectivity view is used so the rel_tensor case gets a 2-D matrix per neighbour.
    edge_coeff = torch.einsum('ni,nij,j->n',ent_embeddings[nbr_ix],nbd_embed_norel[nbr_ix],tail_embed)

    # Compute score by weighting each score (-1,1) by softmax of the similarities
    score = torch.dot(torch.nn.functional.softmax(val[1:],dim=0).float(),edge_coeff.float())

    return score
    
    

In [None]:
def get_filtered_embeddings(path, split, D_ent, D_rel, max_edges, rel_tensor = False):
    """
    Like get_embeddings, but entities with too many incident edges are removed first.

    An entity is dropped when the number of distinct triples of `split` in which it appears
    (as head or tail; a self-loop counts once) is >= max_edges. The surviving entities are
    re-indexed 0..N-1 in their original order so that ent_id matches the rows of the returned
    array. Edges with a dropped (or unknown) endpoint are skipped, since that endpoint no
    longer has a code.

    input: path object, data split (file name), entity embedding dim, relation embedding dim,
           max_edges (degree threshold), rel_tensor flag (whether to keep a relation axis)

    output: embeddings array of shape (N, D_ent, D_ent) -- or (N, D_ent, D_ent, D_rel+1) with
            rel_tensor -- then the filtered ent_id, rel_id, the filtered ent_code, rel_code
            and edge_set
    """
    ent_id, rel_id, ent_code, rel_code, edge_set = get_codebook(path, D_ent, D_rel)

    # Read the split once; blank lines are ignored.
    with open(os.path.join(path, split), "r") as lines:
        triples = [tuple(s.split("\t")) for s in (line.strip() for line in lines) if s]

    # Degree of each entity in one pass over the distinct edges.
    degree = collections.Counter()
    for lhs, _, rhs in set(triples):
        degree[lhs] += 1
        if rhs != lhs:
            degree[rhs] += 1

    # Remove entities with at least max_edges edges and re-compact the indices.
    kept = [ent for ent in ent_id if degree[ent] < max_edges]
    ent_id = {ent: i for i, ent in enumerate(kept)}
    ent_code = {ent: ent_code[ent] for ent in kept}

    N_ent = len(ent_id)
    if rel_tensor:
        embeddings = np.zeros((N_ent,D_ent,D_ent,D_rel+1))
    else:
        embeddings = np.zeros((N_ent,D_ent,D_ent))

    for lhs, rel, rhs in triples:
        if lhs not in ent_id or rhs not in ent_id:
            continue
        head = ent_code[lhs]
        tail = ent_code[rhs]
        if rel_tensor:
            tensor = np.einsum('i,j,k -> ijk',head,tail,rel_code[rel])
        else:
            tensor = np.einsum('i,j -> ij', head, tail)
        embeddings[ent_id[lhs]] += tensor
        embeddings[ent_id[rhs]] += tensor

    return embeddings, ent_id, rel_id, ent_code, rel_code, edge_set


In [30]:
# Number of top matches requested in tch_score; the best match is discarded there,
# so k - 1 neighbours contribute to the score.
k = 20

In [31]:
# Pick one test triple to score.
# NOTE(review): the edges are held in a set, so which triple sits at position 100 is not
# guaranteed to be the same on every run.
edge = list(edge_set['test.txt'])[100]

In [32]:
# Show the selected (head, relation, tail) triple.
edge

('/m/0g2c8',
 '/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee',
 '/m/053yx')

In [33]:
# Score the selected test triple using connectivity only (rel_tensor=False).
tch_score(edge,ent_embeddings, rel_embeddings, nbd_embeddings, ent_id, rel_id, k, rel_tensor= False)

tensor(0.0054, device='cuda:1')

In [81]:
# Manual walk-through of tch_score: take the entities at rows 40 and 3000 as head and tail.
head_embed = ent_embeddings[40]
tail_embed = ent_embeddings[3000]

In [82]:
# Projection matrices e h^T for every entity e, and the head's neighborhood embedding.
proj = tch_projmatrix(head_embed, ent_embeddings)
# NOTE(review): the neighborhood is taken from row 0 while head_embed is row 40 -- confirm
# this mismatch is intended.
head_nbd = nbd_embeddings[0]
# Squared Frobenius norm, used to normalise the similarity below.
Frob_norm = torch.norm(head_nbd)**2

In [83]:
# Same similarity computation as in tch_score: for each entity n, trace((P_n H P_n^T)^T A_n)
# divided by the squared Frobenius norm of H.
x = torch.einsum('nij,jk -> nik',proj,head_nbd) #left multiply by projection matrix
x = torch.einsum('nij,nkj -> nik',x,proj) #right multiply by transpose
res = torch.bmm(x.permute(0,2,1),nbd_embeddings)
coeff = (1/Frob_norm)*torch.einsum('nii -> n',res) #number of matching edges

In [84]:
# Top-k most similar neighborhoods and, for each of them, the bilinear form h^T A_n t.
# NOTE(review): this uses head_embed for every neighbour, whereas tch_score uses the
# neighbour's own embedding (ent_embeddings[curr_ix]) -- confirm which is intended.
k = 20
(val,ix) = torch.topk(coeff,k)
edge_coeff = torch.zeros(k-1).to(device)
for i in range(1,k): #ignore top match, since that's likely to be nbd_embedding of head itself
    curr_ix = ix[i].cpu().item()
    edge_coeff[i-1] = torch.einsum('i,ij,j->',head_embed,nbd_embeddings[curr_ix],tail_embed).cpu().item()

In [None]:
# Inspect the top-k similarity values. `val` is the tensor returned by torch.topk, not a
# callable, so it is displayed rather than called.
val

In [71]:
# Bilinear form h^T A t against the neighborhood embedding of the entity at row 5625.
torch.einsum('i,ij,j->',head_embed,nbd_embeddings[5625],tail_embed).cpu().item()

-0.005081898177581201

In [93]:
# Training triples as a sorted list (deterministic order, unlike the underlying set).
edge_list = list(sorted(edge_set['train.txt']))

In [98]:
# Entity string at position 5625 of ent_id's key order; this equals the entity with index
# 5625 only while ent_id has not had entries deleted.
problem_id = list(ent_id.keys())[5625]

In [92]:
# Squared Frobenius norm of the neighborhood embedding at row 5625.
torch.norm(nbd_embeddings[5625])**2

tensor(1809.9085, device='cuda:1', dtype=torch.float64)

In [102]:
# Number of training triples whose head is the entity under investigation.
sum(1 for triple in edge_list if triple[0] == problem_id)

689

In [105]:
# Display the entity -> index mapping.
# NOTE(review): this prints the whole dictionary; consider showing only a small slice.
ent_id

{'/m/010016': 0,
 '/m/0100mt': 1,
 '/m/0102t4': 2,
 '/m/0104lr': 3,
 '/m/0105y2': 4,
 '/m/0106dv': 5,
 '/m/0108xl': 6,
 '/m/0109vk': 7,
 '/m/010bnr': 8,
 '/m/010bxh': 9,
 '/m/010cw1': 10,
 '/m/010dft': 11,
 '/m/010h9y': 12,
 '/m/010hn': 13,
 '/m/010m55': 14,
 '/m/010nlt': 15,
 '/m/010p3': 16,
 '/m/010r6f': 17,
 '/m/010rvx': 18,
 '/m/010t4v': 19,
 '/m/010tkc': 20,
 '/m/010v8k': 21,
 '/m/010xjr': 22,
 '/m/010y34': 23,
 '/m/010z5n': 24,
 '/m/0113sg': 25,
 '/m/0114m0': 26,
 '/m/0118d3': 27,
 '/m/011_3s': 28,
 '/m/011_6p': 29,
 '/m/011_vz': 30,
 '/m/011hdn': 31,
 '/m/011hq1': 32,
 '/m/011j5x': 33,
 '/m/011k11': 34,
 '/m/011k1h': 35,
 '/m/011k4g': 36,
 '/m/011k_j': 37,
 '/m/011kn2': 38,
 '/m/011lpr': 39,
 '/m/011lvx': 40,
 '/m/011pcj': 41,
 '/m/011s0': 42,
 '/m/011s9r': 43,
 '/m/011v3': 44,
 '/m/011vx3': 45,
 '/m/011w20': 46,
 '/m/011w4n': 47,
 '/m/011w54': 48,
 '/m/011wdm': 49,
 '/m/011wtv': 50,
 '/m/011x_4': 51,
 '/m/011xg5': 52,
 '/m/011xhx': 53,
 '/m/011xjd': 54,
 '/m/011xy1': 55,
 '/m/0

In [108]:
# Degree of every entity in the training graph, computed in a single pass over edge_list
# instead of rescanning the whole list once per entity. A self-loop counts as one edge.
degree = collections.Counter()
for lhs, _, rhs in edge_list:
    degree[lhs] += 1
    if rhs != lhs:
        degree[rhs] += 1

# Drop entities with 100 or more incident training edges. The loop variable is not named `k`,
# so the top-k setting `k` defined earlier is left untouched. Remaining ent_id values keep
# their original indices (they are not re-compacted here).
for ent in list(ent_id.keys()):
    if degree[ent] >= 100:
        del ent_id[ent]
        del ent_code[ent]