In [1]:
import collections
import os
import pickle
import random
import numpy as np
import math
import torch

In [2]:
def get_sets(path):
    """
    Parse the raw FB split files and index every entity (machine id) and relation.

    Each of train.txt, valid.txt and test.txt under `path` is read as
    tab-separated `head<TAB>relation<TAB>tail` triples. Blank lines (for example a
    trailing newline at the end of a file) are skipped.

    !!!The entities and relations are sorted before assigning them to an index,
    so the indices are deterministic for a given dataset.

    Input: path of the dataset directory (anything os.path.join accepts)

    Output: (ent_id, rel_id, edge_set)
        ent_id   -- dict mapping each entity string to a unique integer index
        rel_id   -- dict mapping each relation string to a unique integer index
        edge_set -- dict mapping each split file name to the set of
                    (head, relation, tail) string triples found in that file

    Raises: ValueError if a non-blank line does not consist of exactly three
    tab-separated fields.
    """
    entities, relations = set(), set()
    edge_set = {}
    for split in ["train.txt", "valid.txt", "test.txt"]:
        edges = set()
        with open(os.path.join(path, split), "r") as lines:
            for line_no, line in enumerate(lines, start=1):
                line = line.strip()
                if not line:
                    # Nothing to parse; unpacking an empty line would raise.
                    continue
                fields = line.split("\t")
                if len(fields) != 3:
                    raise ValueError(
                        f"{split}:{line_no}: expected 3 tab-separated fields, "
                        f"got {len(fields)}"
                    )
                lhs, rel, rhs = fields
                entities.add(lhs)
                entities.add(rhs)
                relations.add(rel)
                edges.add((lhs, rel, rhs))
        edge_set[split] = edges

    # Sorting first makes the string -> index assignment reproducible.
    ent_id = {k: i for (i, k) in enumerate(sorted(entities))}
    rel_id = {k: i for (i, k) in enumerate(sorted(relations))}

    return ent_id, rel_id, edge_set


In [3]:
def get_codebook(path, D_ent, D_rel):
    """
    Generate a random codebook for the entities and relations of a dataset.

    Every entity gets a vector sampled from the unit hypersphere in dimension
    D_ent. Every relation gets a vector sampled from the unit hypersphere in
    dimension D_rel with a constant 1. prepended, so relation codes have length
    D_rel + 1.

    input: path object, entity embedding dimension, relation embedding dimension

    output: (ent_id, rel_id, ent_code, rel_code, edge_set). ent_code / rel_code
    are dictionaries from entity (machine id) / relation string to its numpy code
    vector; ent_id, rel_id and edge_set are the output of get_sets, passed through.
    """
    ent_id, rel_id, edge_set = get_sets(path)

    def normal_vec(dim):
        # A normalized isotropic Gaussian sample lies on the unit sphere.
        # standard_normal draws the same N(0, I) distribution as
        # multivariate_normal(zeros, eye) but without building and factorizing
        # a dim x dim covariance matrix for every single sample.
        vec = np.random.standard_normal(dim)
        return vec / np.linalg.norm(vec)

    def nv_append1(dim):
        # Unit vector with a leading constant 1. component.
        return np.append(1., normal_vec(dim))

    # Dict iteration follows insertion order, i.e. the index order of ent_id / rel_id.
    ent_code = {k: normal_vec(D_ent) for k in ent_id}
    rel_code = {k: nv_append1(D_rel) for k in rel_id}

    return ent_id, rel_id, ent_code, rel_code, edge_set

In [22]:
def get_embeddings(path, split, D_ent, D_rel, rel_tensor = False):
    """
    Build, for every entity, the summed tensor code of all edges it takes part in.

    A codebook is drawn with get_codebook, then each triple in the chosen split
    file is encoded as the outer product of its head and tail codes (with the
    relation code as a third factor when rel_tensor is True). That edge tensor is
    accumulated into the slot of the head entity and the slot of the tail entity.

    input: path object, data split (file name inside path), entity embedding dim,
    relation embedding dim, rel_tensor flag

    output: (embeddings, ent_id, rel_id, ent_code, rel_code, edge_set), where
    embeddings has shape (N_ent, D_ent, D_ent), or (N_ent, D_ent, D_ent, D_rel+1)
    when rel_tensor is True
    """
    ent_id, rel_id, ent_code, rel_code, edge_set = get_codebook(path, D_ent, D_rel)

    # Decide once whether the relation factor is part of each edge tensor.
    with_relation = rel_tensor == True
    per_entity_shape = (D_ent, D_ent, D_rel+1) if with_relation else (D_ent, D_ent)
    embeddings = np.zeros((len(ent_id),) + per_entity_shape)

    with open(os.path.join(path, split), "r") as triples:
        for raw in triples:
            lhs, rel, rhs = raw.strip().split("\t")
            head_vec, tail_vec, rel_vec = ent_code[lhs], ent_code[rhs], rel_code[rel]
            if with_relation:
                edge_tensor = np.einsum('i,j,k -> ijk', head_vec, tail_vec, rel_vec)
            else:
                edge_tensor = np.einsum('i,j -> ij', head_vec, tail_vec)
            # The edge belongs to the neighborhood of both of its endpoints.
            for endpoint in (lhs, rhs):
                embeddings[ent_id[endpoint]] += edge_tensor

    return embeddings, ent_id, rel_id, ent_code, rel_code, edge_set


In [16]:
# Root directory of the FB15K-237 dataset; it must contain train.txt, valid.txt
# and test.txt. The default is the location on the original author's machine;
# set the FB15K237_DIR environment variable to run the notebook elsewhere
# without editing this cell.
path = os.environ.get('FB15K237_DIR', '/home/shazoop/KG-Embeddings/datasets/FB15K-237')

In [17]:
# Index every entity and relation of the dataset and collect the edge set of each split.
ent_id,rel_id, edge_set =get_sets(path)

In [18]:
# Total number of distinct edges over all splits, and the average number of
# edges per entity rounded up to the next integer.
edge_total = sum(len(split_edges) for split_edges in edge_set.values())
edge_per_node = math.ceil(edge_total / len(ent_id))

In [23]:
# Build the neighborhood embeddings from the training split, using twice the
# average edge count per entity for both the entity and the relation dimension.
embeddings, ent_id, rel_id, ent_code, rel_code, edge_set = get_embeddings(
    path,
    'train.txt',
    D_ent=2*edge_per_node,
    D_rel=2*edge_per_node,
)

In [24]:
# GPU index used on the original machine. Fall back to GPU 0 when that index
# does not exist, and to the CPU when CUDA is unavailable, so the notebook
# still runs on other hardware.
PREFERRED_GPU = 1
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU if torch.cuda.device_count() > PREFERRED_GPU else 0
    device = torch.device("cuda", gpu_index)
    torch.cuda.set_device(device)
else:
    device = torch.device("cpu")

    Found GPU%d %s which is of cuda capability %d.%d.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is %d.%d.
    


In [25]:
# Stack the per-entity and per-relation code vectors into matrices (rows follow
# the insertion order of the code dictionaries) and move them, together with the
# neighborhood embeddings, onto the selected device.
ent_embeddings = torch.from_numpy(np.stack(list(ent_code.values()), 0)).to(device)
rel_embeddings = torch.from_numpy(np.stack(list(rel_code.values()), 0)).to(device)
nbd_embeddings = torch.from_numpy(embeddings).to(device)

In [26]:
def tch_projmatrix(ent, ent_embeddings):
    '''
    Pair one fixed entity h with every entity e and return the stack of matrices eh^T.

    Input: ent (vector embedding of h, 1-D tensor),
           ent_embeddings (2-D tensor whose row n is the vector embedding of entity n)

    Output: 3-D tensor whose slice [n] is the outer product of ent_embeddings[n] with ent
    '''
    outer_stack = torch.einsum('ni,j ->nij', ent_embeddings, ent)
    return outer_stack

In [29]:
def tch_score(edges,ent_embeddings, rel_embeddings, nbd_embeddings, ent_id, rel_id, rel_tensor= False):
    '''
    Score the proposed relation (h,r,t) by similarity matching, with the tail as the variable.

    Only connectivity is used here: the relation and the proposed tail do not
    enter the computation, so the result holds one coefficient per candidate entity.

    Input:
        edges: a single tuple (mid1, relation, mid2)
        ent_embeddings: tensor of entity vector embeddings, row i for entity index i
        rel_embeddings: tensor of relation vector embeddings (currently unused)
        nbd_embeddings: tensor of per-entity neighborhood embeddings
        ent_id: dictionary from entity string to integer index
        rel_id: dictionary from relation string to integer index (currently unused)
        rel_tensor: True when nbd_embeddings has a trailing relation axis; only
            its slice 0 (edge connectivity) is used in that case

    Output: 1-D tensor with one coefficient per entity; the entry at
    ent_id[mid2] is the coefficient of the proposed tail. All zeros when the
    head has an all-zero neighborhood embedding.
    '''
    # Only the head is needed; unpacking still checks that a full triple was passed.
    head, _relation, _tail = edges
    head_ix = ent_id[head]
    head_embed = ent_embeddings[head_ix]

    if rel_tensor:
        nbd_embed_norel = nbd_embeddings[:,:,:,0] #just use edge connectivity
    else:
        nbd_embed_norel = nbd_embeddings
    head_nbd = nbd_embed_norel[head_ix]

    #Generate the projection matrices for the given head
    proj = tch_projmatrix(head_embed, ent_embeddings)
    Frob_norm = torch.norm(head_nbd)**2
    if Frob_norm == 0:
        # The head has no edges: every numerator below is zero as well, so
        # report zeros instead of the NaN that 0/0 would give.
        return nbd_embed_norel.new_zeros(nbd_embed_norel.shape[0])

    #Compute the graph homomorphism coeff
    x = torch.einsum('nij,jk -> nik',proj,head_nbd) #left multiply by projection matrix
    x = torch.einsum('nij,nkj -> nik',x,proj) #right multiply by transpose
    res = torch.bmm(x.permute(0,2,1),nbd_embed_norel)
    coeff = (1/Frob_norm)*torch.einsum('nii -> n',res) #number of matching edges
    return coeff
    
    
    
    

In [30]:
# Walk through the scoring computation by hand, using entity 0 as the head.
head_embed = ent_embeddings[0]
# tch_projmatrix takes the single entity vector first and the full embedding matrix second.
proj = tch_projmatrix(head_embed, ent_embeddings)
head_nbd = nbd_embeddings[0]
Frob_norm = torch.norm(head_nbd)**2


In [35]:
# Conjugate the head neighborhood by every projection matrix: first proj @ head_nbd,
# then the result times the transposed projection.
x = torch.einsum(
    'nij,njk -> nik',
    torch.einsum('nij,jk -> nik', proj, head_nbd),
    proj.permute(0, 2, 1),
)
# Trace of x^T @ nbd for every entity, scaled by the squared Frobenius norm of the head neighborhood.
res = torch.bmm(x.permute(0, 2, 1), nbd_embeddings)
coeff = torch.einsum('nii -> n', res) * (1/Frob_norm)


In [38]:
# Embeddings of entity 0 (head) and entity 100 (tail) for a spot check.
head, tail = ent_embeddings[0], ent_embeddings[100]

In [39]:
# Projection matrix paired with entity 100; torch.mm needs two operands, and the
# slice itself is what is being inspected. It is expected to equal the transpose
# of torch.einsum('i,j->ij', head, tail).
proj[100]

tensor([[ 0.0106, -0.0089,  0.0157,  ..., -0.0141,  0.0116,  0.0064],
        [-0.0091,  0.0076, -0.0134,  ...,  0.0121, -0.0099, -0.0055],
        [ 0.0039, -0.0032,  0.0057,  ..., -0.0052,  0.0042,  0.0023],
        ...,
        [ 0.0093, -0.0078,  0.0138,  ..., -0.0124,  0.0101,  0.0056],
        [ 0.0068, -0.0057,  0.0101,  ..., -0.0091,  0.0074,  0.0041],
        [ 0.0237, -0.0197,  0.0349,  ..., -0.0314,  0.0257,  0.0142]],
       device='cuda:1', dtype=torch.float64)

In [41]:
# Outer product of head and tail, for comparison with proj[100] (expected to be its transpose).
torch.einsum('i,j->ij',head,tail)

tensor([[ 0.0106, -0.0091,  0.0039,  ...,  0.0093,  0.0068,  0.0237],
        [-0.0089,  0.0076, -0.0032,  ..., -0.0078, -0.0057, -0.0197],
        [ 0.0157, -0.0134,  0.0057,  ...,  0.0138,  0.0101,  0.0349],
        ...,
        [-0.0141,  0.0121, -0.0052,  ..., -0.0124, -0.0091, -0.0314],
        [ 0.0116, -0.0099,  0.0042,  ...,  0.0101,  0.0074,  0.0257],
        [ 0.0064, -0.0055,  0.0023,  ...,  0.0056,  0.0041,  0.0142]],
       device='cuda:1', dtype=torch.float64)

In [None]:
# Connectivity-only view of the neighborhood embeddings. When they were built
# with rel_tensor=True they carry a trailing relation axis and slice 0 is the
# plain head-tail part; otherwise they are already 3-D and are used as they are
# (indexing a fourth axis on a 3-D tensor would raise).
nbd_conn = nbd_embeddings[:,:,:,0] if nbd_embeddings.dim() == 4 else nbd_embeddings
# NOTE(review): this conjugates each entity's own neighborhood by its projection
# matrix, whereas the earlier hand computation conjugates head_nbd -- confirm
# which one is intended.
head_comp = torch.bmm(torch.bmm(proj,nbd_conn),proj.permute((0,2,1)))
res = torch.bmm(head_comp.permute(0,2,1),nbd_conn)
coeff = (1/Frob_norm)*torch.einsum('nii -> n',res)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram (10 bins) of the coefficients, copied to host memory as a numpy array first.
plt.hist(coeff.cpu().numpy(), bins = 10)

In [None]:
# Smallest coefficient over all entities.
coeff.min()

In [None]:
# Shape of the stacked entity embedding matrix.
ent_embeddings.shape

In [None]:
# Random batch of three 2x2 matrices, used to sanity-check the 'nii->n' einsum pattern.
a = torch.rand(3,2,2)

In [None]:
# Display the random batch.
a

In [None]:
# 'nii->n' sums the diagonal of each matrix in the batch, i.e. one trace per matrix.
torch.einsum('nii->n',a)