## Get 80% train, 20% test

In [None]:
import numpy as np
import pandas as pd

# Load the pre-split train and test pair tables as NumPy arrays.
# Later cells read columns 0 and 1 of each row as dataset ids and column 2 as the label.
train_mask, test_mask = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ("./datasets/train2.csv", "./datasets/test2.csv")
)
# Optional negatives-only test split (disabled):
# test_negative_mask = pd.read_csv("./datasets/test_only_negative2.csv").to_numpy()
# print("Train samples: "+str(len(train_mask)) + " Test samples: "+str(len(test_mask)) + " Test negative samples: "+str(len(test_negative_mask)))
print(
    "Train samples: " + str(len(train_mask))
    + " Test samples: " + str(len(test_mask))
)
# Unique node ids per split (disabled):
# nodes_train = np.unique(np.concatenate((train_mask[:,0],train_mask[:,1])))
# nodes_test = np.unique(np.concatenate((test_mask[:,0],test_mask[:,1])))

## Read graph of metafeatures

In [None]:
import networkx as nx
# Load the metafeature graph; the alternative encodings are kept for quick switching.
g_x = nx.read_gpickle("./word_embeddings/encoded_fasttext.gpickle")
#g_x = nx.read_gpickle("./word_embeddings/encoded_bert.gpickle")
#g_x = nx.read_gpickle("./word_embeddings/siimple.gpickle")

# Integer code for every known textual node type.
tipo_codes = {
    "dataset": 0,
    "feature dataset": 1,
    "literal dataset": 2,
    "attribute": 3,
    "feature attribute": 4,
    "literal attribute": 5,
}
# Walk the nodes in sorted order: encode 'tipo' (unknown values are left
# untouched) and store each node's position in that ordering under 'order'.
for position, (_, attrs) in enumerate(sorted(g_x.nodes(data=True))):
    attrs['tipo'] = tipo_codes.get(attrs['tipo'], attrs['tipo'])
    attrs['order'] = position

# (node name, order) for every dataset node (tipo == 0), in graph iteration order.
dataset_order = [
    (name, attrs['order'])
    for name, attrs in g_x.nodes(data=True)
    if attrs['tipo'] == 0
]
datasets = [name for name, _ in dataset_order]
order = [pos for _, pos in dataset_order]
map_order = dict(dataset_order)                                # name  -> order
map_reverse_order = {pos: name for name, pos in dataset_order}  # order -> name
# Lookup of a known dataset; raises KeyError if 'DS_1' is missing from the graph.
map_order['DS_1']

# Replace the raw dataset ids in both splits, in place, with the 'order' index
# of the matching "DS_<id>" node.
for pairs in (train_mask, test_mask):
    for row in pairs:
        row[0] = map_order["DS_" + str(row[0])]
        row[1] = map_order["DS_" + str(row[1])]
# for mask in test_negative_mask:
#     mask[0] = map_order["DS_"+str(mask[0])]
#     mask[1] = map_order["DS_"+str(mask[1])]

## Deep graph library

In [None]:
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
# Convert the networkx graph into a DGL graph, carrying over the 'tipo' and
# 'vector' node attributes and no edge attributes.
g = DGLGraph()
#dgl.from_networkx(g,['vector'])
g.from_networkx(
    g_x,
    node_attrs=['tipo', 'vector'],
    edge_attrs=None,
)
# Drop the reference to the networkx graph; only `g` is used from here on.
g_x = None

## Helpers

In [None]:
def resultSet_train(features,mask):
    """Gather the paired embeddings and targets for a batch of training pairs.

    Args:
        features: tensor indexable by node id (one embedding per node).
        mask: iterable of rows ``(node_a, node_b, label)``.

    Returns:
        Tuple ``(v1, v2, labels)``: the stacked embeddings of ``node_a``, the
        stacked embeddings of ``node_b``, and a tensor of targets in which a
        label of 0 is emitted as -1 (the 1/-1 target convention used by
        e.g. ``CosineEmbeddingLoss``).

    Raises:
        RuntimeError: from ``th.stack`` when ``mask`` is empty.

    The rows of ``mask`` are never modified: the 0 -> -1 conversion is applied
    only to the returned labels, so the caller's array keeps its original 0/1
    labels and can still be used for evaluation.
    """
    v1 = []
    v2 = []
    labels = []
    for row in mask:
        v1.append(features[row[0]])
        v2.append(features[row[1]])
        label = row[2]
        # Convert on the way out instead of writing back into the caller's row.
        labels.append(-1 if label == 0 else label)
    return th.stack(v1),th.stack(v2),th.tensor(labels)

def resultSet_train_softmax(features,mask,labels):
    """Collect the embedding and label of every distinct node referenced by ``mask``.

    Args:
        features: tensor indexable by node id (one embedding per node).
        mask: iterable of rows whose first two entries are node ids.
        labels: sequence indexable by node id giving each node's label.

    Returns:
        Tuple ``(v1, labels_out)``: the stacked embeddings of the distinct
        nodes in order of first appearance, and a tensor with the label of
        each of those nodes.

    Raises:
        RuntimeError: from ``th.stack`` when ``mask`` is empty.
    """
    v1 = []
    labels_out = []
    loaded = set()  # node ids already emitted; set gives O(1) membership tests
    for n in mask:
        for node in (n[0], n[1]):
            node = int(node)
            if node in loaded:
                continue
            loaded.add(node)
            v1.append(features[node])
            # Each node gets its OWN label (the second node of a pair used to
            # receive the label of the first one).
            labels_out.append(labels[node])
    return th.stack(v1),th.tensor(labels_out)

def normalization(vector):
    """Return ``vector`` divided by its norm as computed by ``th.norm``."""
    magnitude = th.norm(vector)
    return vector / magnitude

from scipy.spatial.distance import cosine
def resultSet_eval(features,mask,loss,sim=0.7):
    """Turn pairwise embedding similarity into 0/1 predictions for the pairs in ``mask``.

    Args:
        features: tensor indexable by node id (one embedding per node).
        mask: 2-D array of rows ``(node_a, node_b, label)``.
        loss: name of the loss the model was trained with; selects the metric.
            ``"ContrastiveLoss"`` / ``"Euclidean"``: a pair is predicted 1
            when its Euclidean distance is ``<= sim``.
            ``"CosineEmbeddingLoss"``: a pair is predicted 1 when its cosine
            similarity is ``>= sim``.
        sim: decision threshold (a distance or a similarity, depending on ``loss``).

    Returns:
        Tuple ``(predictions, labels)`` of tensors with one entry per pair.

    Raises:
        ValueError: if ``loss`` is not one of the supported names (previously
            the function fell through and returned ``None``).
    """
    if loss == "ContrastiveLoss" or loss == "Euclidean":
        pdist = th.nn.PairwiseDistance(p=2)
        distances = pdist(features[mask[:,0]], features[mask[:,1]])
        # Threshold on the Python float so the comparison is done in double precision.
        indices = [1.0 if d.item() <= sim else 0.0 for d in distances]
        return th.tensor(indices),th.tensor(mask[:,2])

    if loss == "CosineEmbeddingLoss":
        # The module is stateless, so build it once instead of once per pair.
        cos = th.nn.CosineSimilarity(dim=0, eps=1e-6)
        indices = []
        labels = []
        for n in mask:
            similarity = cos(features[n[0]],features[n[1]])
            indices.append(1 if similarity.item() >= sim else 0)
            labels.append(n[2])
        return th.tensor(indices),th.tensor(labels)

    raise ValueError("Unsupported loss for evaluation: " + repr(loss))

def evaluate(model, g, features, mask,loss,eval_sim):
    """Return the fraction of pairs in ``mask`` that the model classifies correctly.

    The model is switched to eval mode and run once on ``(g, features)``
    without gradient tracking; ``resultSet_eval`` then thresholds the
    resulting embeddings with ``eval_sim`` using the metric selected by
    ``loss``, and the predictions are compared to the labels.
    """
    model.eval()
    with th.no_grad():
        predicted, expected = resultSet_eval(model(g, features), mask, loss, eval_sim)
        hits = (predicted == expected).sum().item()
    return float(hits) / len(expected)


In [None]:
def test_model(sim):
    """Return the pair-classification accuracy on ``train_mask`` at cosine threshold ``sim``.

    Evaluation always uses the "CosineEmbeddingLoss" metric. A disabled
    earlier variant reported accuracy separately for the positive and the
    negative pairs of the combined train+test data.
    """
    # NOTE(review): `net` is not defined in the visible part of this file
    # (train() uses `training.net`) -- confirm it exists before calling.
    return evaluate(
        model=net,
        g=g,
        features=g.ndata['vector'],
        mask=train_mask,
        loss="CosineEmbeddingLoss",
        eval_sim=sim,
    )
# test_model(0.8)

In [None]:
def test_similarity(training):
    """Score how well nearest-neighbour search over dataset embeddings recovers known positive pairs.

    For every dataset node the embeddings of all dataset nodes are ranked
    against it: by Euclidean distance (4 closest) when ``training.loss_name``
    is "ContrastiveLoss", or by cosine similarity clamped at 0 (20 most
    similar) when it is "CosineEmbeddingLoss". The first hit is dropped
    (presumably the node itself -- not verified here). The per-node score is
    the number of known positive partners found among the remaining
    neighbours, divided by ``min(#neighbours, #partners)``.

    Args:
        training: object exposing ``net`` (the model) and ``loss_name``.

    Returns:
        Mean per-node score over the dataset nodes that had something to
        check, or 0.0 when no node did (including an unsupported
        ``loss_name``).

    Reads the module-level ``g``, ``test_mask``, ``train_mask`` and
    ``map_reverse_order``.
    """
    total_data = np.concatenate((test_mask,train_mask))
    pos_data = np.array([x for x in total_data if x[2]==1])
    # Node ids of the dataset nodes; built once instead of on every iteration.
    dataset_ids = list(map_reverse_order.keys())

    training.net.eval()
    # Pure evaluation: no autograd graph is needed for the forward pass.
    with th.no_grad():
        embeddings = training.net(g, g.ndata['vector'])
    ds_embeddings = embeddings[dataset_ids]

    pdist = th.nn.PairwiseDistance(p=2)
    thecos = th.nn.CosineSimilarity(dim=1, eps=1e-6)
    sum_accuracy = 0
    no_pairs = 0
    check_vector = []
    for i in dataset_ids:
        candidate = embeddings[i]
        if training.loss_name == "ContrastiveLoss":
            result = pdist(candidate,ds_embeddings)
            k, largest = 4, False
        elif training.loss_name == "CosineEmbeddingLoss":
            result = th.clamp(thecos(candidate.reshape(1,len(candidate)),ds_embeddings),min=0)
            k, largest = 20, True
        else:
            # Unsupported loss: nothing is ranked for this node.
            continue

        # topk raises when k exceeds the number of candidates, so cap it.
        k = min(k, len(dataset_ids))
        closest_indices = th.topk(result, k, largest=largest).indices[1:]
        closest = np.array([dataset_ids[x] for x in closest_indices])

        # NOTE(review): `check_vector` on the right-hand side still holds the
        # partners collected for the PREVIOUS node, so those are excluded
        # here. Kept as-is; confirm whether this carry-over is intended.
        check_vector1 = np.array([x[1] for x in pos_data if x[0]==i and x[1] not in check_vector])
        check_vector2 = np.array([x[0] for x in pos_data if x[1]==i and x[0] not in check_vector])
        check_vector = np.concatenate((check_vector1,check_vector2),axis=0)
        closest_checked = np.array([x for x in check_vector if x in closest])

        comparable = min(len(closest),len(check_vector))
        if comparable > 0:
            sum_accuracy = sum_accuracy + len(closest_checked) / comparable
        else:
            no_pairs +=1

    evaluated = len(dataset_ids) - no_pairs
    if evaluated == 0:
        # No node had pairs to check; avoid dividing by zero.
        return 0.0
    return sum_accuracy / evaluated

In [None]:
#   test_similarity("CosineEmbeddingLoss")

# Training
### NN architectures: 
0: 'Bert_768',  <br>
1: 'Fasttext_150', <br>
2: 'Fasttext_150_150_100',  <br>
3: 'Fasttext_300' <br>
### Loss functions: 
0: 'ContrastiveLoss', <br>
1: 'CosineEmbeddingLoss', <br>
2: 'Euclidean' <br>
#### Example to define architecture and loss
import step3_gcn_nn_concatenate as gcn_nn <br>
import step3_gcn_loss as gcn_loss <br>
print(gcn_nn.get_options()) #list of options<br>
print(gcn_loss.get_options()) #list of options<br>
print(gcn_nn.get_instance(option=0,name=None)) #or gcn_nn.get_instance(option=None,name="Bert_768") <br> print(gcn_loss.get_instance(0,margin=0.5,reduction="sum")) #or gcn_loss.get_instance(0,margin=0.5,reduction="sum") <br>

### Load training class to save/load/train experiments:
import step3_gcn_training as gcn_training


In [None]:
import time
import numpy as np
def train(training,iterations):
    """Run ``iterations`` training epochs on ``train_mask`` and persist the experiment.

    Each epoch performs one optimizer step per split of ``train_mask``
    (recomputing the node embeddings for every split), then measures the
    pair accuracy on ``test_mask`` at threshold 0.90 and the neighbour-based
    score from ``test_similarity``. One record per epoch is appended to
    ``training.log`` and printed; ``training.runtime_seconds`` and
    ``training.epochs_run`` are updated along the way. After the last epoch
    ``training.save_state()`` is called.

    Args:
        training: experiment object exposing ``net``, ``loss``, ``optimizer``,
            ``loss_name``, ``batch_splits``, ``log``, ``epochs_run``,
            ``runtime_seconds`` and ``save_state()``.
        iterations: number of epochs to run.
    """
    def five_decimals(value):
        # Round via string formatting, as the log format expects.
        return float('%.5f' % value)

    epoch_durations = []
    # batch_splits is the NUMBER of chunks train_mask is cut into, not the
    # size of each chunk.
    train_batch = np.array_split(train_mask, training.batch_splits)
    for _ in range(iterations):
        training.net.train()
        started = time.time()
        for split in train_batch:
            embeddings = training.net(g, g.ndata['vector'])
            v1, v2, labels = resultSet_train(embeddings, split)
            loss = training.loss(v1, v2, labels)
            training.optimizer.zero_grad()
            loss.backward()
            training.optimizer.step()

        # Wall-clock time of the optimisation part of this epoch.
        elapsed = time.time() - started
        epoch_durations.append(elapsed)
        training.runtime_seconds += elapsed

        # Evaluation metrics.
        acc = evaluate(training.net, g, g.ndata['vector'], test_mask, training.loss_name, 0.90)
        acc2 = test_similarity(training)

        # Log record; 'loss' is the loss of the last split of the epoch.
        output = {
            'epoch': training.epochs_run,
            'loss': five_decimals(loss.item()),
            'acc': five_decimals(acc),
            'acc2': five_decimals(acc2),
            'time_epoch': five_decimals(np.mean(epoch_durations)),
            'time_total': five_decimals(training.runtime_seconds),
        }
        training.log.append(output)
        training.epochs_run += 1
        print(str(output))

    # Write the results and the model to disk.
    training.save_state()

## Config and run training

In [None]:
import step3_gcn_nn_concatenate as gcn_nn
import step3_gcn_loss as gcn_loss
import step3_gcn_training as gcn_training

# Experiment object: train() reads its net, loss, optimizer and log, and uses
# it to save the experiment state.
training = gcn_training.Training()

# Resume a previously saved experiment from its checkpoint file.
training.load_state(
    path="./models/net_name:Fasttext_150|batch_splits:40.0000|lr:0.0010|loss_name:CosineEmbeddingLoss|loss_parameters:0.0+mean.pt"
)

# To start a new experiment instead, configure it explicitly:
# training.set_training(
#             net_name= gcn_nn.get_option_name(1),
#             batch_splits=40,
#             lr=1e-3,
#             loss_name=gcn_loss.get_option_name(1),
#             loss_parameters="0.0+mean")

# Run a single epoch.
train(training, 1)