## Get 80% train, 20% test

In [None]:
import numpy as np
import pandas as pd

# Load the pre-computed pair tables; every CSV is converted to a plain numpy
# array of rows so it can be indexed and edited in place later on.
train_mask, test_mask, test_negative_mask = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in (
        "./datasets/train2.csv",
        "./datasets/test2.csv",
        "./datasets/test_only_negative2.csv",
    )
)

# One-line summary of how many rows each table holds.
print("".join([
    "Train samples: ", str(len(train_mask)),
    " Test samples: ", str(len(test_mask)),
    " Test negative samples: ", str(len(test_negative_mask)),
]))
# nodes_train = np.unique(np.concatenate((train_mask[:,0],train_mask[:,1]))) 
# nodes_test = np.unique(np.concatenate((test_mask[:,0],test_mask[:,1])))

## Read graph of metafeatures

In [None]:
import networkx as nx
# NOTE(review): a gpickle file is a Python pickle; only load files that come
# from a trusted source.
g_x = nx.read_gpickle("./word_embeddings/encoded_fasttext.gpickle")
#g_x = nx.read_gpickle("encoded_features.gpickle")
#g_x = nx.read_gpickle("siimple.gpickle")

# Integer code assigned to each textual node type stored under 'tipo'.
tipo_codes = {
    "dataset": 0,
    "feature dataset": 1,
    "literal dataset": 2,
    "attribute": 3,
    "feature attribute": 4,
    "literal attribute": 5,
}

# Walk the nodes in sorted order, swap the textual type for its code (unknown
# types are left untouched) and remember each node's position in that order.
for order, (x, n) in enumerate(sorted(g_x.nodes(data=True))):
    t = n['tipo']
    if t in tipo_codes:
        n['tipo'] = tipo_codes[t]
    n['order'] = order

# Dataset nodes (type code 0) and the two lookup tables between a dataset's
# node name and its position in the sorted node order.
datasets = [name for name, attrs in g_x.nodes(data=True) if attrs['tipo'] == 0]
order = [attrs['order'] for name, attrs in g_x.nodes(data=True) if attrs['tipo'] == 0]
map_order = dict(zip(datasets, order))
map_reverse_order = dict(zip(order, datasets))
# Sanity check: raises KeyError when the graph has no node named 'DS_1'.
map_order['DS_1']

# Rewrite, in place, the two id columns of every pair table so they hold the
# node order index instead of the raw dataset number.
for mask_table in (train_mask, test_mask, test_negative_mask):
    for mask in mask_table:
        mask[0] = map_order["DS_"+str(mask[0])]
        mask[1] = map_order["DS_"+str(mask[1])]

## Deep graph library

In [None]:
import dgl
# Convert the networkx graph to the Deep Graph Library format, carrying the
# 'tipo' and 'vector' node attributes over as node data (no edge attributes).
# NOTE(review): the instance method DGLGraph.from_networkx appears to belong to
# an older DGL API -- confirm against the installed DGL version.
g = dgl.DGLGraph()
#gdl.from_networkx(g,['vector'])
g.from_networkx(g_x,node_attrs=['tipo','vector'], edge_attrs=None)
# Drop the reference to the networkx graph once it has been converted.
g_x = None

In [None]:
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph

# Message function: every edge carries its source node's 'vector' as message 'm'.
gcn_msg = fn.copy_src(src='vector', out='m')
# Reduce function: every node sums its incoming 'm' messages into its 'vector'.
gcn_reduce = fn.sum(msg='m', out='vector')

## Helpers

In [None]:
def resultSet_train(features,mask):
    """Gather the embedding pairs and training labels described by `mask`.

    Args:
        features: tensor indexable by node index (one row per node).
        mask: iterable of rows ``(node_a, node_b, label)``.

    Returns:
        Tuple ``(v1, v2, labels)``: the stacked embeddings of the first and
        second node of every row, and the labels as a tensor in which every
        0 has been replaced by -1 (the pairwise losses in this file expect
        +1 / -1 targets).

    Raises:
        RuntimeError: from ``th.stack`` when `mask` is empty.

    The input `mask` is left unmodified; the 0 -> -1 conversion is applied to
    a copy of the label column so the caller's array keeps its 0/1 labels.
    """
    v1 = []
    v2 = []
    for n in mask:
        v1.append(features[n[0]])
        v2.append(features[n[1]])
    # np.array over a list builds a fresh array, so the caller's rows are not touched.
    labels = np.array([n[2] for n in mask])
    labels[labels == 0] = -1
    return th.stack(v1),th.stack(v2),th.tensor(labels)

def resultSet_train_softmax(features,mask,labels):
    """Collect, once per node, the embedding and label of every node in `mask`.

    Args:
        features: tensor indexable by node index (one row per node).
        mask: iterable of rows whose first two entries are node indices.
        labels: sequence or tensor indexable by node index.

    Returns:
        Tuple ``(v1, labels_out)``: the stacked embeddings of the distinct
        nodes in order of first appearance, and their labels as a tensor.

    Raises:
        RuntimeError: from ``th.stack`` when `mask` is empty.
    """
    v1 = []
    labels_out = []
    loaded = set()  # node indices already emitted
    for n in mask:
        for node in (n[0], n[1]):
            key = int(node)
            if key in loaded:
                continue
            loaded.add(key)
            v1.append(features[node])
            # Each node gets its own label (the second node of a pair used to
            # receive the label of the first one).
            labels_out.append(labels[node])
    return th.stack(v1),th.tensor(labels_out)

def normalization(vector):
    """Return `vector` divided by its norm.

    ``th.norm`` is called without a dimension, so a single norm computed over
    all elements of the tensor is used as the divisor.
    """
    magnitude = th.norm(vector)
    scaled = vector / magnitude
    return scaled

from scipy.spatial.distance import cosine
def resultSet_eval(features,mask,loss,sim=0.7):
    """Turn pairwise similarities into binary predictions for the rows of `mask`.

    Args:
        features: tensor of node embeddings, indexable by node index.
        mask: numpy array of rows ``(node_a, node_b, label)``.
        loss: name of the loss the model was trained with; selects the metric.
        sim: decision threshold.

    Returns:
        ``(predictions, labels)`` tensors. For "ContrastiveLoss" / "Euclidean"
        a pair is predicted 1.0 when the L2 distance between the normalized
        embeddings is <= `sim`; for "CosineEmbeddingLoss" a pair is predicted 1
        when the cosine similarity is >= `sim`. Any other `loss` yields None.
    """
    if loss in ("ContrastiveLoss", "Euclidean"):
        first = normalization(features[mask[:,0]])
        second = normalization(features[mask[:,1]])
        distances = nn.PairwiseDistance(p=2)(first, second)
        predictions = [1.0 if d <= sim else 0.0 for d in distances.tolist()]
        return th.tensor(predictions),th.tensor(mask[:,2])

    if loss == "CosineEmbeddingLoss":
        cos = th.nn.CosineSimilarity(dim=0, eps=1e-6)
        predictions = []
        truths = []
        for n in mask:
            score = cos(features[n[0]],features[n[1]]).item()
            predictions.append(1 if score >= sim else 0)
            truths.append(n[2])
        return th.tensor(predictions),th.tensor(truths)

def evaluate(model, g, features, mask,loss,eval_sim):
    """Return the fraction of rows in `mask` that the model classifies correctly.

    The model is switched to eval mode and run without gradient tracking; its
    embeddings are thresholded by ``resultSet_eval`` using `loss` and
    `eval_sim`, and the predictions are compared with the labels in `mask`.
    """
    model.eval()
    with th.no_grad():
        predicted, expected = resultSet_eval(model(g, features), mask, loss, eval_sim)
        hits = (predicted == expected).sum().item()
    return hits * 1.0 / len(expected)

    
def write_to_file(text,id_file):
    """Write `text` to the file ``id_file + '.txt'``, replacing any previous content.

    Args:
        text: a string or an iterable of strings; the pieces are written
            back to back (``writelines`` adds no line separators).
        id_file: path of the output file without the '.txt' extension.
    """
    # The context manager closes the file even when writing raises.
    with open(id_file+'.txt','w') as out_file:
        out_file.writelines(text)

In [None]:
# def test_model():
#     PATH= "./models/threshold_simmetry 0.0170 _ batch_split 20.0000_learning_rate 0.0010 |loss_function ContrastiveLoss + margin 0.3000 +reduction mean_0.pt"
#     model = NetSimilarity()
#     print(model.state_dict()['layer1.linear.weight'].size())
#     model.load_state_dict(th.load(PATH))
#     total_data = np.concatenate((test_negative_mask,test_mask,train_mask))
#     neg_data = np.array([x for x in total_data if x[2]==0])
#     pos_data = np.array([x for x in total_data if x[2]==1])
#     print(len(neg_data))
#     print(len(pos_data))
#     acc_pos = evaluate(net, g, g.ndata['vector'], pos_data,"ContrastiveLoss",0.017)
#     acc_neg = evaluate(net, g, g.ndata['vector'], neg_data,"ContrastiveLoss",0.017)
#     return ("Accuracy with possitive: "+ str(acc_pos) + " and negatives: " + str(acc_neg))

In [None]:
def test_similarity(sim):
    """Score how often a dataset's nearest neighbours are known positive matches.

    For every dataset node the (up to) three closest other dataset nodes are
    looked up in embedding space, and the positive pairs (label 1 in the
    train / test tables) that link the node to one of those neighbours are
    counted. The per-node score is that count divided by the number of
    neighbours; the final score is the average over all dataset nodes.

    Reads the module-level `net`, `g`, `map_reverse_order`, `train_mask`,
    `test_mask` and `test_negative_mask`.

    Args:
        sim: name of the training loss, which selects the metric:
            "ContrastiveLoss" or "Euclidean" use the L2 distance (smaller is
            closer), "CosineEmbeddingLoss" uses the cosine similarity clamped
            at 0 (larger is closer).

    Returns:
        The string ``"Accuracy: <value>"``.

    Raises:
        ValueError: if `sim` is not one of the supported names.
    """
    if sim in ("ContrastiveLoss", "Euclidean"):
        use_distance = True
    elif sim == "CosineEmbeddingLoss":
        use_distance = False
    else:
        raise ValueError("Unsupported similarity: {!r}".format(sim))

    total_data = np.concatenate((test_negative_mask,test_mask,train_mask))
    pos_data = np.array([x for x in total_data if x[2]==1])

    # Node indices of the dataset nodes; row j of ds_embeddings belongs to node_ids[j].
    node_ids = list(map_reverse_order.keys())
    with th.no_grad():  # evaluation only, no autograd graph needed
        embeddings = net(g, g.ndata['vector'])
    ds_embeddings = embeddings[node_ids]

    pdist = nn.PairwiseDistance(p=2)
    thecos = th.nn.CosineSimilarity(dim=1, eps=1e-6)
    # One extra candidate because the node itself is normally its own nearest hit.
    k = min(4, len(node_ids))
    sum_accuracy = 0

    for i in node_ids:
        candidate = embeddings[i].unsqueeze(0)
        if use_distance:
            result = pdist(candidate,ds_embeddings)
            nearest = th.topk(result, k, largest=False).indices
        else:
            result = th.clamp(thecos(candidate,ds_embeddings),min=0)
            nearest = th.topk(result, k, largest=True).indices

        # topk yields positions inside ds_embeddings; translate them back to
        # node indices and drop the node itself before comparing with the pairs.
        neighbours = [node_ids[j] for j in nearest.tolist() if node_ids[j] != i][:3]
        if not neighbours:
            continue
        closest = set(neighbours)
        matches = sum(
            1 for x in pos_data
            if (x[0] == i or x[1] == i) and (x[0] in closest or x[1] in closest)
        )
        sum_accuracy = sum_accuracy + matches / len(neighbours)
    return "Accuracy: "+str(sum_accuracy / len(map_reverse_order))
                                
        
    
        
    
    

In [None]:
#test_similarity(loss_function)

## Loss Functions

In [None]:
class ContrastiveLoss(th.nn.Module):
    """
    Contrastive loss function.
    Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Labels are expected to be +1 for pairs that should be close and -1 for
    pairs that should be at least `margin` apart: with label +1 only the
    squared distance contributes, with label -1 only the squared margin
    violation ``max(margin - distance, 0)`` does.

    Args:
        margin: minimum distance wanted between dissimilar pairs.
        reduction: 'sum' or 'mean', how the per-pair losses are combined.

    Raises:
        ValueError: if `reduction` is neither 'sum' nor 'mean'.
    """

    def __init__(self, margin=2.0,reduction='sum'):
        super(ContrastiveLoss, self).__init__()
        if reduction not in ('sum', 'mean'):
            raise ValueError("reduction must be 'sum' or 'mean', got {!r}".format(reduction))
        self.margin = margin
        self.reduction = reduction

    def forward(self, output1, output2, label):
        """Return the reduced contrastive loss of the pairs (output1[i], output2[i])."""
        pdist = nn.PairwiseDistance(p=2)
        norm_euclidean = pdist(output1, output2)

        per_pair = (0.5 * (1+label) * th.pow(norm_euclidean, 2) +
                    0.5 * (1-label) * th.pow(th.clamp(self.margin -
                                                      norm_euclidean, min=0.0), 2))
        if self.reduction == 'sum':
            return th.sum(per_pair)
        if self.reduction == 'mean':
            return th.mean(per_pair)
        # Only reachable when `reduction` was changed after construction.
        raise ValueError("reduction must be 'sum' or 'mean', got {!r}".format(self.reduction))
    
class Euclidean(th.nn.Module):
    """Mean L2 distance loss over pairs labelled +1 / -1.

    With label +1 a pair contributes its distance. With label -1 it
    contributes ``max(-distance, 0)``, which is 0 for every non-negative
    distance, so such pairs do not add to the loss.
    """

    def __init__(self):
        super(Euclidean, self).__init__()
        self.pdist = nn.PairwiseDistance(p=2)

    def forward(self, output1, output2, label):
        """Return the mean loss of the pairs (output1[i], output2[i])."""
        euclidean_distance = self.pdist(output1, output2)
        loss_euclidean = th.mean( 0.5 * (1+label) * euclidean_distance +
                                      0.5 * (1-label) * th.clamp(- euclidean_distance, min=0.0))
        return loss_euclidean

import torch.nn as nn
def get_loss_function(loss,param1=None,param2=None):
    """Build the loss module selected by name.

    Args:
        loss: "CosineEmbeddingLoss", "Euclidean" or "ContrastiveLoss".
        param1: margin for the cosine and contrastive losses (default 0.2).
        param2: reduction for the cosine and contrastive losses (default 'sum').
            Both parameters are ignored for "Euclidean".

    Returns:
        The configured loss module.

    Raises:
        ValueError: if `loss` is not one of the supported names.
    """
    if loss == "CosineEmbeddingLoss":
        marg = 0.2 if param1 is None else param1
        red = 'sum' if param2 is None else param2
        return nn.CosineEmbeddingLoss(reduction=red,margin=marg)

    if loss == "Euclidean":
        return Euclidean()

    if loss == "ContrastiveLoss":
        marg = 0.2 if param1 is None else param1
        reduction = 'sum' if param2 is None else param2
        return ContrastiveLoss(marg,reduction)

    # Fail here rather than handing back None and crashing later at call time.
    raise ValueError("Unknown loss function: {!r}".format(loss))
        
    

## Define NN layers

In [None]:
from dgl.nn.pytorch import GraphConv

class GCNLayer(nn.Module):
    """Graph layer that combines each node's features with its neighbours' sum.

    `forward` concatenates a node's own features with the aggregated
    neighbour features before the linear map, so `in_feats` has to be twice
    the width of the features passed to `forward`.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g,feature):
        """Return the transformed node features for graph `g`."""
        # Work inside a local scope so that the 'vector' ndata written below
        # is popped automatically when the scope exits.
        with g.local_scope():
            g.ndata['vector'] = feature
            # Message passing with the module-level gcn_msg / gcn_reduce functions.
            g.update_all(gcn_msg, gcn_reduce)
            aggregated = g.ndata['vector']
            combined = th.cat([feature, aggregated], dim=1)
            g.ndata['vector'] = self.linear(combined)
            return g.ndata['vector']

class Net(nn.Module):
    """Two GCN layers followed by a linear projection, for 300-wide node features."""

    def __init__(self):
        super().__init__()

        self.layer1 = GCNLayer(600, 300)
        self.layer2 = GCNLayer(600, 300)
        self.layer3 = nn.Linear(300, 300)

    def forward(self, g,features):
        """Return the node embeddings of graph `g`.

        The output is divided by ``th.norm`` of the whole output tensor, i.e.
        one norm over all elements rather than one norm per node.
        """
        hidden = F.leaky_relu(self.layer1(g, features))
        hidden = self.layer2(g, hidden)
        projected = th.tanh(self.layer3(hidden))
        return projected / th.norm(projected)

    def forward_softmax(self, features):
        """Return log-softmax scores computed from `features`.

        NOTE(review): this method uses ``self.layer4``, which ``__init__`` does
        not create, so calling it raises AttributeError as the class stands.
        """
        hidden = th.tanh(self.layer3(features))
        return F.log_softmax(self.layer4(hidden), dim=1)
    
class Net_Bert(nn.Module):
    """Two GCN layers followed by a linear projection, for 768-wide node features.

    Unlike `Net`, the output of `forward` is not divided by its norm.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = GCNLayer(1536, 768)
        self.layer2 = GCNLayer(1536, 768)
        self.layer3 = nn.Linear(768, 768)

    def forward(self, g,features):
        """Return the node embeddings of graph `g`."""
        hidden = F.leaky_relu(self.layer1(g, features))
        hidden = self.layer2(g, hidden)
        return th.tanh(self.layer3(hidden))

    def forward_softmax(self, features):
        """Return log-softmax scores computed from `features`.

        NOTE(review): this method uses ``self.layer4``, which ``__init__`` does
        not create, so calling it raises AttributeError as the class stands.
        """
        hidden = th.tanh(self.layer3(features))
        return F.log_softmax(self.layer4(hidden), dim=1)
    

## Training

In [None]:
def train():
    """Train the module-level `net` for `iterations` epochs and save the results.

    Every epoch runs one optimizer step per entry of `train_batch`, then
    scores the model with ``test_similarity`` and records a summary line.
    After the last epoch the summary lines are written under ./results/ and
    the model weights under ./models/, both named after `hyper_params` and
    `exp_number`.

    Reads the globals `iterations`, `net`, `g`, `train_batch`, `loss_func`,
    `optimizer`, `dur`, `loss_function`, `file_experiment` and `hyper_params`;
    increments the globals `ep` (once per epoch) and `exp_number` (once per call).
    """
    import os

    global ep
    global exp_number

    # Create the output folders up front so a missing directory cannot make
    # the final save fail after all the training work is done.
    os.makedirs("./results", exist_ok=True)
    os.makedirs("./models", exist_ok=True)

    for epoch in range(iterations):
        t0 = time.time()
        net.train()
        for split in train_batch:
            embeddings = net(g, g.ndata['vector'])
            v1,v2,labels = resultSet_train(embeddings,split)
            loss = loss_func(v1,v2, labels)
            optimizer.zero_grad()
            #loss.backward(retain_graph=True)
            loss.backward()
            optimizer.step()

        dur.append(time.time() - t0)
#         acc = evaluate(net, g, g.ndata['vector'], test_mask,loss_function,eval_sim)
        acc = test_similarity(loss_function)
        # test_similarity returns ready-made text while evaluate returns a
        # float; applying '{:.4f}' to text raises ValueError, so format only numbers.
        acc_text = acc if isinstance(acc, str) else "{:.4f}".format(acc)
        output = str("Epoch {:05d} | Loss {:.4f} | Test Acc {} | Time(s) {:.4f}"
                     .format(ep, loss.item(), acc_text, np.mean(dur)))
        print(output)
        file_experiment.append(output)
        ep+=1

    write_to_file(file_experiment,"./results/" + hyper_params + "_"+ str(exp_number))
    th.save(net.state_dict(), "./models/" + hyper_params + "_"+ str(exp_number)+".pt")
    print("Model Saved Successfully")
    exp_number+=1

## Config and run training

In [None]:
import time
import numpy as np

# Summary lines collected by train() and written to disk at the end of a run.
file_experiment = []
#th.set_num_threads(2)

# Hyper-parameters of the experiment.
eval_sim = 0.017
batch_splits = 20
lr = 1e-3
loss_function = "ContrastiveLoss"
margin = 0.3
reduction = 'mean'
# Text that identifies the run; it is used in the names of the result and model files.
hyper_params = (
    "Fasttext_threshold_simmetry {:.4f} _ batch_split {:.4f}_learning_rate {:.4f} |"
    "loss_function {} + margin {:.4f} +reduction {}"
).format(eval_sim, batch_splits, lr, loss_function, margin, reduction)

# Model, optimizer, loss and the mini-batches of training pairs.
exp_number = 0
net = Net()
optimizer = th.optim.Adam(net.parameters(), lr)
dur = []
loss_func = get_loss_function(loss_function, margin, reduction)
train_batch = np.array_split(train_mask, batch_splits)
ep = 0
iterations = 3

train()

## Deprecated

In [None]:
# import time
# import numpy as np

# file_experiment = []

# #hyperPar
# eval_sim = 0.7
# batch_splits = 1
# lr=1e-3
# loss_function = "CosineEmbeddingLoss+CrossEntroppy"
# margin=0.5
# reduction='mean'
# hyper_params = str("threshold_simmetry {:.4f} _ batch_split {:.4f}_learning_rate {:.4f} |\
# loss_function {} + margin {:.4f} +reduction {}".format(eval_sim,batch_splits,lr,loss_function,margin,reduction))

# #NN
# exp_number = 0
# net = Net()
# optimizer = th.optim.Adam(net.parameters(),lr)
# dur = []
# loss_func = get_loss_function(loss_function)
# train_batch = np.array_split(train_mask,batch_splits)

# for epoch in range(1):
#     t0 = time.time()
#     net.train()
#     for mask in train_batch:
#         embeddings = net.forward(g, g.ndata['vector'])
#         v1,v2,labels = resultSet_train(embeddings,mask)
#         loss = loss_func(v1,v2, labels)
#         optimizer.zero_grad()
#         #loss.backward(retain_graph=True)
#         loss.backward()
#         optimizer.step()

    
#     ##############################
# #     v_softmax,labels_tipo = resultSet_train_softmax(embeddings,train_mask,g.ndata['tipo'])
# #     embeddings_softmax = net.forward_softmax(v_softmax)
# #     loss2 = F.nll_loss(embeddings_softmax, labels_tipo)
# #     loss2.backward()
# #     optimizer.step()
    
    
# #     total_loss = loss2 + loss
# #     total_loss.backward()
# #     optimizer.step()
#     ##############################
    
    
    
#     dur.append(time.time() - t0)
    
#     #acc = evaluate(net, g, embeddings.detach(), test_mask)
#     acc = evaluate(net, g, g.ndata['vector'], test_mask,eval_sim)
#     output = str("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
#             epoch+5, loss.item(), acc, np.mean(dur)))
#     print(output)
#     file_experiment.append(output)

# write_to_file(file_experiment,hyper_params + "_"+ str(exp_number))
# exp_number+=1

## Tests

In [None]:
# cos = th.nn.CosineSimilarity(dim=0, eps=1e-6)
# resultCos = cos(embeddings[576].detach(), embeddings[513].detach())

# result,label = resultSet_eval(g.ndata['vector'],test_mask)

# #result_sum = th.sum(1-(result - label))
# #label
# ds1_set = list(set(df_matching["'dataset1_id'"]))
# ds2_set = list(set(df_matching["'dataset2_id'"]))
# print(len(ds1_set))
# print(len(ds2_set))

# import torch as th
# hola = th.tensor([True, True, False])
# th.sum(hola)

# hola = list(filter(lambda x: x[2] == 1, test_mask))
# print (len(hola)/len(test_mask))

# cos = th.nn.CosineSimilarity(dim=0, eps=1e-6)
# #output = cos(embeddings[495].detach(),embeddings[576].detach())
# output = cos(embeddings[576].detach(),embeddings[574].detach())
# output

# #get data from a specific node
# g.nodes[0].data
# #get data from nodes
# g.ndata
# #another way of accessing data from a node
# g.ndata['tipo'][0]
# g.edges()
# g.has_edge_between(374,17619)

# g.ndata

# for n in g.nodes:
#     print (n.data['tipo'])

In [None]:
# import torch as th
# hola = th.tensor([[1.0,2.0],[1.2,2.0]])
# chau = th.tensor([[-1.0,-2.0],[-1.2,-2.0]])
# res = th.cat([hola,chau],dim=1)
# res

In [None]:
# def normalization(vector):
#     max_v = torch.max(vector)
#     normalized = (vector) / max_v.item()
#     return normalized

import torch.nn as nn
import torch
# Scratch experiment: rank three random vectors by their clamped cosine
# similarity to one random query vector.
pdist = nn.PairwiseDistance(p=2)
input1 = 10.0 * torch.randn(1, 5)
input2 = 10.0 * torch.randn(3, 5)
# Each tensor is scaled by the norm taken over all of its elements.
input1 = input1 / input1.norm()
input2 = input2 / input2.norm()
print(input1[0])
print(input2[0])
thecos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
# Negative similarities are clamped to 0.
output = thecos(input1, input2).clamp(min=0)
print(output)
# Two best matches; the final expression shows the indices after the first one.
tuples = torch.topk(output, 2, largest=True)
tuples.indices[1:]

In [None]:
# Print the order index of every dataset node (the keys of map_reverse_order).
for i in map_reverse_order:
    print(i)

In [None]:
# Number of dataset nodes registered in map_reverse_order.
len(map_reverse_order)