In [None]:
import itertools
import random
import sys
from sys import getsizeof
from typing import Dict, List

import networkx as nx
import numpy as np
import torch
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [None]:
def write_log(s, path = 'log.out', prnt = True):
    """Append a message to a log file and optionally echo it to stdout.

    Args:
        s: Text to record. It is written preceded by a newline character.
        path: File to append to (opened in append mode).
        prnt: When true, the message is also printed.
    """

    # The context manager closes the handle even if write() or print() raises.
    with open(path, "a") as f:
        f.write('\n' + s)
        if prnt:
            print(s)

def load_node2vec(path: str):
    """Read node vectors from a whitespace-separated text file.

    Each non-blank line is split on whitespace: the first token becomes the
    key and every remaining token is parsed as a float.

    Args:
        path: Location of the embeddings file.

    Returns:
        dict mapping the first token of each line to its list of floats.

    Raises:
        ValueError: if a token after the key cannot be parsed as a float.
    """

    embeddings_dict = {}
    with open(path, "r") as f:
        for line in tqdm(f):
            vals = line.split()
            if not vals:
                # A blank line (e.g. a trailing newline at end of file) has no
                # key; indexing vals[0] would raise IndexError.
                continue
            embeddings_dict[vals[0]] = [float(x) for x in vals[1:]]

    print("### Node2Vec loaded")
    return embeddings_dict

def load_glove(path: str):
    """Read word vectors from a GloVe-style text file.

    Each non-blank line is split on whitespace: the first token is the word
    and every remaining token is parsed as a float.

    Args:
        path: Location of the vectors file. It is decoded as UTF-8 so the
            result does not depend on the platform's default encoding.

    Returns:
        A pair ``(embeddings_dict, dict_keys)``: the word -> vector mapping and
        the list of its keys in file order.

    Raises:
        ValueError: if a token after the word cannot be parsed as a float.
    """

    embeddings_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            vals = line.split()
            if not vals:
                # Blank line: nothing to index, and vals[0] would raise.
                continue
            embeddings_dict[vals[0]] = [float(x) for x in vals[1:]]

    dict_keys = list(embeddings_dict)
    # The original string lacked the f prefix and printed the braces verbatim.
    print(f"### Glove loaded, vocab size {len(embeddings_dict)}")
    return embeddings_dict, dict_keys

def load_entities(path: str):
    """Build an id -> text mapping from a tab-separated file.

    Every line is stripped and split on tabs; the first column is used as the
    key and the second column as the value. Further columns are ignored, and a
    line with fewer than two columns raises IndexError.

    Args:
        path: Location of the tab-separated file.

    Returns:
        dict mapping first-column values to second-column values.
    """

    with open(path) as handle:
        rows = (raw.strip().split('\t') for raw in handle)
        id_to_text = {columns[0]: columns[1] for columns in rows}

    print('\nEntities loaded')
    return id_to_text

def load_relations(path: str):
    """Read one relation name per line.

    Args:
        path: Location of the relations file.

    Returns:
        list with the stripped content of every line, in file order (a blank
        line yields an empty string).
    """

    with open(path) as handle:
        names = [raw.strip() for raw in handle]

    print('\nRelations loaded')
    return names

In [None]:
class KGDataset(Dataset):
    """Triples read from a tab-separated file, served as (input, target) tensors.

    Every line of the file is ``head<TAB>relation<TAB>tail``. An item is the
    concatenation of the head and tail entity sequences (see
    ``get_ent_embeddings``) together with a one-hot vector over ``relations``.
    """

    def __init__(self, path: str,
        entities_dict: dict,
        descriptions_dict: dict,
        node2vec: dict,
        relations: list,
        device,
        settings: dict,
        embeddings_dict: dict,
        dict_keys,
        ) -> None:
        """Load the triples and build a multigraph over them.

        Args:
            path: Tab-separated triples file.
            entities_dict: entity id -> short text.
            descriptions_dict: entity id -> long text, used when
                ``use_description`` is requested.
            node2vec: entity id -> vector (indexed by entity id, i.e. a
                mapping such as the one ``load_node2vec`` returns).
            relations: relation names; the position is the class index.
            device: stored on the instance; not used by this class itself.
            settings: reads 'SAMPLE_SIZE', 'DIMENSIONS', 'VECTOR_VALUE' and
                'padding'.
            embeddings_dict: word -> vector. NOTE: single characters that are
                missing get an entry added by ``get_glove_embedding``.
            dict_keys: sequence of keys of ``embeddings_dict``.
        """

        self.entities_dict = entities_dict
        self.descriptions_dict = descriptions_dict
        self.relations = relations
        self.device = device
        self.settings = settings
        self.node2vec = node2vec
        self.embeddings_dict = embeddings_dict
        self.dict_keys = dict_keys
        self.stats = {'glove_greedy': set(), 'glove_not_found': set()}

        # Read file
        # TODO: load the graph once and share it instead of rebuilding per split.
        with open(path) as f:
            file_lines = f.readlines()

        # Keep only the leading SAMPLE_SIZE fraction of the file.
        self.lines = file_lines[: int(len(file_lines) * settings['SAMPLE_SIZE']) ]
        print(f'\nopened {path}' )

        self.G = nx.MultiGraph()
        for line in self.lines:
            items = [p.strip() for p in line.strip().split('\t')]

            # Triple has 3 only
            assert len(items) == 3, f'expected 3 tab-separated fields, got {len(items)}'

            head, rel, tail = items
            # add_edge creates missing endpoint nodes itself.
            self.G.add_edge(head, tail, name = rel)

        write_log(str(self.G))

    def gettext(self, index):
        """Return ``(head_text, relation, tail_text, head_id, tail_id)`` for a line."""

        fields = self.lines[index].strip().split('\t')
        head = self.entities_dict[fields[0]]
        tail = self.entities_dict[fields[2]]
        rel = fields[1]
        return head, rel, tail, fields[0], fields[2]

    def __len__(self) -> int:
        """Number of triples kept after sampling."""

        return len(self.lines)

    def _char_embedding(self, char):
        """Vector for a single character, assigning a stand-in when unknown.

        An unknown character is mapped to the vector of the vocabulary entry
        at position ``ord(char) % 100`` and that choice is stored in
        ``embeddings_dict`` so later lookups agree. The stand-in is resolved
        only when needed, so known characters never index ``dict_keys``.
        """

        if char not in self.embeddings_dict:
            self.embeddings_dict[char] = self.embeddings_dict[self.dict_keys[ord(char) % 100]]
        return self.embeddings_dict[char]

    def get_glove_embedding(self, word):
        """Gets the embeddings of a word from Glove vectors.

        Lookup order:
          1. the lower-cased word itself;
          2. the longest substring of at least 3 characters (leftmost first)
             that is in the vocabulary;
          3. the element-wise mean of the per-character vectors.

        Returns:
            list of ``settings['DIMENSIONS']`` floats (for cases 1 and 2 this
            is the list object stored in ``embeddings_dict``).
        """

        wrd = str(word).lower()
        if wrd in self.embeddings_dict:
            return self.embeddings_dict[wrd]

        # Greedy largest chunk
        lngth = len(wrd)
        for i in range(lngth - 1, 2, -1):
            for j in range(lngth - i + 1):
                sub_wrd = wrd[j:j + i]
                if sub_wrd in self.embeddings_dict:
                    return self.embeddings_dict[sub_wrd]

        dims = self.settings['DIMENSIONS']
        if not wrd:
            # No characters to average over.
            return [0.0] * dims

        char_embeddings = [self._char_embedding(c) for c in wrd]
        # Mean over characters: divide by how many vectors were summed (the
        # previous code divided by the vector dimensionality instead).
        count = len(char_embeddings)
        return [sum(vec[i] for vec in char_embeddings) / count for i in range(dims)]

    def get_ent_embeddings(self, ent: str, is_head = False, use_description = False):
        """Build the fixed-length vector sequence for one entity.

        The entity text is split on whitespace and every word vector gets one
        extra trailing component: ``+VECTOR_VALUE`` for a head entity,
        ``-VECTOR_VALUE`` otherwise. At most ``padding // 2`` word rows are
        produced; when the text is longer, the surplus words are averaged into
        the last word row. The entity's node2vec vector is appended, and the
        result is zero-padded to ``padding // 2 + 1`` rows.

        Args:
            ent: Entity id.
            is_head: Selects the sign of the trailing component.
            use_description: Use ``descriptions_dict`` instead of
                ``entities_dict`` for the text.

        Returns:
            list of ``padding // 2 + 1`` rows of ``DIMENSIONS + 1`` floats.
        """

        entity_text = self.entities_dict[ent]

        # use description
        if use_description:
            entity_text = self.descriptions_dict[ent]

        words = entity_text.split()

        type_vector = [-1 * self.settings['VECTOR_VALUE']]
        if is_head:
            type_vector = [self.settings['VECTOR_VALUE']]

        # Check if longer than allowed
        max_size = self.settings['padding'] // 2
        if len(words) > max_size:
            embeddings = [
                self.get_glove_embedding(words[j]) + type_vector
                for j in range(max_size - 1)
            ]
            remaining_embeddings = [
                self.get_glove_embedding(words[j])
                for j in range(max_size - 1, len(words))
            ]
            embeddings.append( np.mean(remaining_embeddings, axis=0).tolist() + type_vector )
        else:
            embeddings = [self.get_glove_embedding(w) + type_vector for w in words]

        # N2V
        embeddings.append(self.node2vec[ent] + type_vector)

        # fill: one independent zero row per missing position
        row_width = self.settings['DIMENSIONS'] + 1
        missing = max_size + 1 - len(embeddings)
        if missing > 0:
            embeddings.extend([0.0] * row_width for _ in range(missing))

        return embeddings

    def __getitem__(self, index):
        """Return ``(inputs, target)`` for the triple at ``index``.

        ``inputs`` stacks the head rows followed by the tail rows;
        ``target`` is one-hot over ``self.relations``.

        Raises:
            ValueError: if the relation is not present in ``self.relations``.
        """

        fields = self.lines[index].strip().split('\t')
        # Validate before any field is used.
        assert len(fields) == 3
        rel = fields[1]

        # Prepare Y label
        relations_tagged = [0.0] * len(self.relations)
        relations_tagged[ self.relations.index(rel) ] = 1.0

        embeddings = (
            self.get_ent_embeddings(fields[0], is_head = True)
            + self.get_ent_embeddings(fields[2], is_head = False)
        )

        return torch.Tensor(embeddings), torch.Tensor(relations_tagged)

In [None]:
# https://spotintelligence.com/2023/01/31/self-attention/
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """Single-head dot-product self-attention with learned Q/K/V projections.

    The input must be 3-D (``torch.bmm`` is used); attention weights are
    normalised with a softmax over the last axis of the score matrix.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        # Layers are created in the order query, key, value.
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        """Return the attention-weighted values; output has the shape of ``x``."""
        q, k, v = self.query(x), self.key(x), self.value(x)
        # Scores are divided by sqrt(input_dim) before the softmax.
        scale = self.input_dim ** 0.5
        weights = self.softmax(torch.bmm(q, k.transpose(1, 2)) / scale)
        return torch.bmm(weights, v)

In [None]:
class MyModel(torch.nn.Module):
    """Bidirectional LSTM -> self-attention -> flatten -> dropout -> linear -> sigmoid.

    The final linear layer is sized for ``settings['padding'] + 2`` time steps
    of ``settings['lstm_hidden_size']`` features each.
    """

    def __init__(self,output_size, settings):
        """Create the layers.

        Args:
            output_size: Number of output units.
            settings: reads 'DIMENSIONS', 'lstm_hidden_size', 'lstm_layers',
                'dropout' and 'padding'.
        """

        super().__init__()
        hidden = settings['lstm_hidden_size']
        steps = settings['padding'] + 2

        # Each direction gets hidden // 2 units.
        self.lstm = torch.nn.LSTM(
            input_size = settings['DIMENSIONS'] + 1,
            hidden_size = hidden // 2,
            num_layers = settings['lstm_layers'],
            bidirectional = True,
            batch_first = True,
        )
        self.attn = SelfAttention(hidden)
        self.flatten = torch.nn.Flatten()
        self.dropout = torch.nn.Dropout(settings['dropout'])
        self.last = torch.nn.Linear(hidden * steps, output_size)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a batch of sequences to per-output sigmoid scores."""
        sequence, _ = self.lstm(x)
        attended = self.attn(sequence)
        flat = self.dropout(self.flatten(attended))
        return self.sigmoid(self.last(flat))


In [None]:
def train_model(device, relations, settings, training_generator, validation_generator, verbose):
    """Train a ``MyModel`` with early stopping and return the best checkpoint.

    Uses BCELoss, Adam and a StepLR schedule. After every epoch the mean
    validation loss is compared with the best one so far: an improvement of
    more than 1e-5 saves the weights to 'chkpnt.pt' and resets the stall
    counter; otherwise the counter is incremented, and training stops once the
    counter exceeds ``settings['PATIENCE'] - 1``.

    Args:
        device: torch device for the model and the batches.
        relations: Relation names; its length is the model's output size.
        settings: reads 'lr', 'decay', 'stepping', 'EPOCHS', 'PATIENCE' plus
            whatever ``MyModel`` reads. Every entry is written to the log.
        training_generator: Iterable of ``(inputs, targets)`` batches.
        validation_generator: Iterable of ``(inputs, targets)`` batches.
        verbose: Show the tqdm progress bar when true.

    Returns:
        A ``MyModel`` on ``device`` loaded from the checkpoint written by this
        call, or the in-memory model when no epoch produced a checkpoint.
    """

    mymodel = MyModel(output_size = len(relations), settings = settings)
    # loss_f = torch.nn.CrossEntropyLoss()
    loss_f = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(mymodel.parameters(), lr = settings['lr'], )
    scheduler = lr_scheduler.StepLR(optimizer, gamma=settings['decay'], step_size= settings['stepping'])
    mymodel.to(device)

    # Print settings
    for k, v in settings.items():
        write_log(f'{k:<25} {v}')

    v_loss = 1_000_000
    no_change_counter = 1
    checkpoint_written = False
    for epoch in range(settings['EPOCHS']):
        print(f'\nEpoch {epoch + 1}\n-------------------------------')
        mymodel.train()
        loop = tqdm(training_generator, disable = not verbose)
        losses = []

        # Loop over batches in an epoch using DataLoader
        for data in loop:
            inputs = data[0].to(device)
            targets = data[1].to(device)
            optimizer.zero_grad()
            predy = mymodel(inputs)
            loss = loss_f(predy, targets)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        train_loss = sum(losses) / len(losses)
        # Read before scheduler.step(), i.e. the rate this epoch trained with.
        lr = optimizer.param_groups[0]['lr']

        # Loop over batches in an epoch using DataLoader
        v_losses = []
        mymodel.eval()
        with torch.no_grad():
            for data in validation_generator:
                inputs = data[0].to(device)
                targets = data[1].to(device)
                predy = mymodel(inputs)
                # Store plain floats, as the training loop does, so no device
                # tensors are kept alive and v_loss stays a Python number.
                v_losses.append(loss_f(predy, targets).item())

        v_loss_epoch = sum(v_losses) / len(v_losses)
        write_log(f'lr {lr:8f} train loss {train_loss:.8f} val loss {v_loss_epoch:.8f}')

        if v_loss - v_loss_epoch > 0.00001 :
            v_loss = v_loss_epoch
            no_change_counter = 0
            torch.save(mymodel.state_dict(), 'chkpnt.pt')
            checkpoint_written = True
        elif no_change_counter > settings['PATIENCE'] - 1:
            break
        else:
            no_change_counter += 1

        scheduler.step()

    if not checkpoint_written:
        # Nothing was saved by this run (no epochs, or the loss never beat the
        # initial bound); loading 'chkpnt.pt' would pick up a stale file from
        # an earlier run or fail, so hand back the model as it is.
        return mymodel

    best_model = MyModel(output_size = len(relations), settings = settings)
    best_model.to(device)
    # map_location keeps loading independent of the device the file was saved from.
    best_model.load_state_dict(torch.load('chkpnt.pt', map_location = device))
    return best_model

In [None]:
def evaluate(model, test_generator, device, verbose, settings, test_set, known_triples = None):
    """Rank the gold relation of every test triple and log MR / MRR / Hits@k.

    For each item the relations are sorted by predicted score and the 0-based
    position of the gold relation is its rank. When that rank is above 10 the
    item is scored again from the entities' long descriptions and the better
    of the two ranks is kept. The filtered rank discounts higher-scored
    relations that are themselves known for the same (head, tail) pair.

    Args:
        model: Trained model; switched to eval mode here.
        test_generator: Iterable of ``(inputs, targets)`` batches. Items are
            matched back to ``test_set`` by
            ``batch_index * BATCH_SIZE + offset``, so it must iterate
            ``test_set`` in order with that batch size.
        device: torch device for the batches.
        verbose: Show the tqdm progress bar when true.
        settings: reads 'BATCH_SIZE'.
        test_set: Dataset providing ``gettext`` and ``get_ent_embeddings``.
        known_triples: Optional mapping ``'<head_id>_<tail_id>'`` -> iterable
            of relation indices. When omitted, the notebook-level ``triples``
            dictionary is used, as before.

    Results are written with ``write_log``; items whose final rank is above 10
    are also appended to 'failures.out'.
    """

    print(f'\nTesting\n-------------------------------')

    if known_triples is None:
        # Backward compatible default: the global built in a later cell.
        known_triples = triples

    ranks = []
    ranks_filtered = []
    reciprocal_ranks = []
    reciprocal_ranks_filtered = []
    # hits[k] collects the Hits@(k+1) indicator of every item.
    hits = [[] for _ in range(10)]
    hits_filtered = [[] for _ in range(10)]

    model.eval()
    with torch.no_grad():
        loop = tqdm(test_generator, disable = not verbose)
        for batch_id, data in enumerate(loop):
            inputs = data[0].to(device)
            targets = data[1].to(device)
            batch_pred = model(inputs)
            for i, item in enumerate(batch_pred):
                item_id = batch_id * settings['BATCH_SIZE'] + i
                h, r, t, e1, e2 = test_set.gettext(item_id)
                gold_index = torch.argmax(targets[i]).item()
                indices = torch.argsort(item, descending = True)
                rank = (indices == gold_index).nonzero().item()

                # NOTE(review): rank is 0-based, so `rank > 10` skips an item at
                # 1-based position 11 although it already misses Hits@10 --
                # confirm whether `>= 10` was intended.
                if rank > 10:
                    # Retry with the long descriptions instead of the names.
                    desc_embeddings = (
                        test_set.get_ent_embeddings(e1, is_head = True, use_description = True )
                        + test_set.get_ent_embeddings(e2, is_head = False, use_description = True )
                    )
                    desc_pred = model( torch.Tensor([desc_embeddings]).to(device) )
                    indices2 = torch.argsort(desc_pred[0], descending = True)
                    new_rank = (indices2 == gold_index).nonzero().item()
                    if new_rank < rank:
                        rank = new_rank
                        indices = indices2

                ranks.append(rank + 1)
                reciprocal_ranks.append(1/(rank + 1))

                # filter work: relations scored above the gold one are exactly
                # the first `rank` entries of the ordering.
                higher_rels = indices[:rank].tolist()
                pair_relations = set(known_triples.get(e1 + '_' + e2, ()))
                filter_rank = rank - sum(1 for rel_id in higher_rels if rel_id in pair_relations)

                ranks_filtered.append(filter_rank + 1)
                reciprocal_ranks_filtered.append(1/(filter_rank + 1))

                # Hits work
                for hits_level in range(10):
                    hits[hits_level].append(1.0 if rank <= hits_level else 0.0)
                    hits_filtered[hits_level].append(1.0 if filter_rank <= hits_level else 0.0)

                if rank > 10:
                    write_log(str(rank) + '^' +str(item_id) + '^' + h + '^' + r + '^' + t + '^' + e1 + '^' + e2,'failures.out', False)

    write_log(f'\n{"MR":<15} {np.mean(ranks):.4f}')
    write_log(f'{"MR Filtered":<15} {np.mean(ranks_filtered):.4f}')
    write_log(f'\n{"MRR":<15} {np.mean(reciprocal_ranks):.4f}')
    write_log(f'{"MRR Filtered":<15} {np.mean(reciprocal_ranks_filtered):.4f}')
    for i in [0,1,2,4,9]:
        write_log('')
        write_log(f'Raw Hits           {i + 1:<3}: {np.mean(hits[i]):<5.4f}')
        write_log(f'Filtered Hits  {i + 1:<3}: {np.mean(hits_filtered[i]):<5.4f}')
        # write_log('Raw Filtered Hits @{0}: {1}'.format(i+1, np.mean(hits_filtered[i])))


In [None]:
def get_relations(head, tail, relations, settings, device, training_set, entities_dict):
    """Print the model's score for every relation between two entities.

    A fresh ``MyModel`` is built from ``settings`` and loaded from
    'chkpnt.pt', so ``settings`` must describe the architecture that produced
    that checkpoint.

    Args:
        head: Head entity id.
        tail: Tail entity id.
        relations: Relation names, in the model's output order.
        settings: Model configuration passed to ``MyModel``.
        device: torch device to run on.
        training_set: Dataset used only for ``get_ent_embeddings``.
        entities_dict: entity id -> text, used for the header line.
    """
    print(entities_dict[head],',',entities_dict[tail])
    mymodel = MyModel(output_size = len(relations), settings = settings)
    mymodel.to(device)
    # map_location keeps loading independent of the device the file was saved from.
    mymodel.load_state_dict(torch.load('chkpnt.pt', map_location = device))
    mymodel.eval()
    with torch.no_grad():
        pair_embeddings = (
            training_set.get_ent_embeddings(head, is_head = True)
            + training_set.get_ent_embeddings(tail, is_head = False)
        )
        batch = torch.Tensor([pair_embeddings]).to(device)
        scores = mymodel(batch)[0]
        for rel, score in zip(relations, scores):
            # Scores come from a sigmoid and lie in [0, 1]; the '%' format
            # type scales by 100 so the printed percentage is correct.
            print(f'{rel:>100} \t {score.item():.2%}')

In [None]:
# settings

settings = dict(
    # --- run size ---
    SAMPLE_SIZE = 0.001,          # fraction of each split file that is kept
    EPOCHS = 2,
    # --- input encoding ---
    VECTOR_VALUE = 1,             # +value marks head rows, -value marks tail rows
    DIMENSIONS = 50,              # width of the word / node vectors
    # --- training loop ---
    PATIENCE = 2,
    BATCH_SIZE = 32,
    # --- model ---
    lstm_hidden_size = 400,
    lstm_layers = 2,
    dropout = 0.2,
    # --- optimiser / schedule ---
    lr = 1e-3,
    decay = 0.35,                 # StepLR gamma
    padding = 20 + 20,            # word rows for head + tail
    stepping = 1,                 # StepLR step_size
    # --- files ---
    # alternative: 'n2v/n2v_dimensions_300_walk_length_50_walks_50_window_10.csv'
    n2v_path = 'n2v/n2v_embeddings50.csv',
    entities_path = 'data/FB15K/entity2text.txt',
    descriptions_path = 'data/FB15K/entity2textlong.txt',
    relations_path = 'data/FB15K/relations.txt',
)
# The GloVe file name is derived from the vector width chosen above.
settings['glove_path'] = f"glove.6B.{settings['DIMENSIONS']}d.txt"

In [None]:
# Loading data

def seed_worker(worker_id):
    """Seed NumPy and ``random`` from torch's initial seed, reduced to 32 bits.

    Passed to DataLoader as ``worker_init_fn``; ``worker_id`` is not used.
    """
    seed32 = torch.initial_seed() & 0xFFFFFFFF
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed32)

# Generator handed to every DataLoader, seeded for repeatable runs.
g = torch.Generator()
g.manual_seed(0)

embeddings_dict, dict_keys = load_glove(settings['glove_path'])
relations = load_relations(settings['relations_path'])
entities_dict = load_entities(settings['entities_path'])
descriptions_dict = load_entities(settings['descriptions_path'])
node2vec = load_node2vec(settings['n2v_path'])

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
write_log('\ndevice ' + str(device))


def _make_split(path):
    """Build the dataset for one triples file and the DataLoader that batches it.

    All splits share the lookup tables, settings and generator loaded above.
    No shuffle option is passed to the loader.
    NOTE(review): this also applies to the training loader -- confirm that
    training on unshuffled batches is intended.
    """
    dataset = KGDataset(
        path = path,
        entities_dict = entities_dict,
        descriptions_dict = descriptions_dict,
        relations = relations,
        device = device,
        settings = settings,
        node2vec = node2vec,
        embeddings_dict = embeddings_dict,
        dict_keys = dict_keys,
    )
    loader = DataLoader(
        dataset,
        batch_size = settings['BATCH_SIZE'],
        worker_init_fn = seed_worker,
        generator = g,
    )
    return dataset, loader


# Generators
training_set, training_generator = _make_split('data/FB15K/train.tsv')
validation_set, validation_generator = _make_split('data/FB15K/dev.tsv')
test_set, test_generator = _make_split('data/FB15K/test.tsv')

# TODO: delete objects that are no longer needed

print('\ninput shape' + str(training_set[0][0].shape))


In [None]:
# model = train_model(device, relations, settings, training_generator, validation_generator, verbose = True)

def _record_known_relations(dataset, known):
    """Add each triple of `dataset` to `known` as '<head>_<tail>' -> [relation index, ...].

    Positions are walked with range(len(...)) and only `gettext` is called, so
    no embeddings are built here (iterating the dataset itself would run
    `__getitem__` for every triple).
    """
    # First position of each name, matching what list.index would return.
    rel_to_index = {}
    for position, name in enumerate(dataset.relations):
        rel_to_index.setdefault(name, position)

    for i in range(len(dataset)):
        _, rel, _, head_id, tail_id = dataset.gettext(i)
        known.setdefault(head_id + '_' + tail_id, []).append(rel_to_index[rel])


# Every relation known for a (head, tail) pair across all three splits;
# evaluate() reads this for the filtered metrics.
triples = {}
for split in (training_set, validation_set, test_set):
    _record_known_relations(split, triples)


# Single run with the base settings:
# model = train_model(device, relations, settings, training_generator, validation_generator, verbose = True)
# evaluate(model, test_generator, device, verbose = True, settings=settings, test_set=test_set)


# tune
RUN_TUNING = True
if RUN_TUNING:
    grid = itertools.product(
        [1e-3, 8e-3],          # lr
        [400],                 # lstm_hidden_size
        [2],                   # lstm_layers
        [0.15],                # dropout
        [0.5, 0.35, 0.25],     # decay
        [1],                   # stepping
    )
    for lr, l1, l2, d, dd, st in grid:
        # Work on a copy so the base `settings` is not overwritten by the
        # sweep. 'chkpnt.pt' holds the weights of the last configuration
        # trained; rebuild a model for it with matching architecture settings.
        tune_settings = {
            **settings,
            'lr': lr,
            'lstm_hidden_size': l1,
            'lstm_layers': l2,
            'dropout': d,
            'decay': dd,
            'stepping': st,
        }
        model = train_model(device, relations, tune_settings, training_generator, validation_generator, verbose = False)
        evaluate(model, test_generator, device, verbose = False, settings=tune_settings, test_set=test_set)

In [None]:
# /m/0x67	/people/ethnicity/people	/m/0411q
# get_relations('/m/0x67', '/m/0411q', relations, settings, device, training_set, entities_dict)