## Get 80% train, 20% test

In [1]:
import numpy as np
import pandas as pd

train_mask = pd.read_csv("./train.csv").to_numpy()
test_mask = pd.read_csv("./test.csv").to_numpy()
print("Train samples: "+str(len(train_mask)) + " Test samples: "+str(len(test_mask)))
# nodes_train = np.unique(np.concatenate((train_mask[:,0],train_mask[:,1]))) 
# nodes_test = np.unique(np.concatenate((test_mask[:,0],test_mask[:,1])))

Train samples: 2145 Test samples: 374


## Read graph of metafeatures

In [2]:
import networkx as nx
g_x = nx.read_gpickle("encoded_fasttext.gpickle")
#g = nx.read_gpickle("encoded_features.gpickle")
#g = nx.read_gpickle("siimple.gpickle")
order = 0
for x,n in sorted(g_x.nodes(data=True)):
    t = n['tipo']
    if t == "dataset":
        n['tipo'] = 0
    if t == "feature dataset":
        n['tipo'] = 1
    if t == "literal dataset":
        n['tipo'] = 2
    if t == "attribute":
        n['tipo'] = 3
    if t == "feature attribute":
        n['tipo'] = 4
    if t == "literal attribute":
        n['tipo'] = 5  
    n['order']=order
    order+=1
    
datasets = [x for (x,y) in g_x.nodes(data=True) if y['tipo']==0]
order = [y['order'] for x,y in g_x.nodes(data=True) if y['tipo']==0]
map_order = dict(zip(datasets,order))
map_order['DS_1']

374

## Deep graph library

In [3]:
import dgl
#convert from networkx to graph deep library format
g = dgl.DGLGraph()
#gdl.from_networkx(g,['vector'])
g.from_networkx(g_x,node_attrs=['tipo','vector'], edge_attrs=None)
g_x = None

Using backend: pytorch


In [4]:
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph

gcn_msg = fn.copy_src(src='vector', out='m')
gcn_reduce = fn.sum(msg='m', out='vector')

## Helpers

In [5]:
def resultSet_train(features,mask):
    v1 = []
    v2 = []
    labels = []
    for n in mask:
        v1.append(features[n[0]])
        v2.append(features[n[1]])
        if n[2] == 0:
            n[2] = -1
        labels.append(n[2])
    return th.stack(v1),th.stack(v2),th.tensor(labels)

def resultSet_train_softmax(features,mask,labels):
    v1 = []
    labels_out = [] 
    loaded = []
    for n in mask:
        if n[0] not in loaded:
            loaded.append(n[0])
            v1.append(features[n[0]])
            labels_out.append(labels[n[0]])
        if n[1] not in loaded:
            loaded.append(n[1])
            v1.append(features[n[1]])
            labels_out.append(labels[n[0]])
    return th.stack(v1),th.tensor(labels_out)

from scipy.spatial.distance import cosine
def resultSet_eval(features,mask,sim=0.7):
    indices = []
    labels = []
    for n in mask:
        cos = th.nn.CosineSimilarity(dim=0, eps=1e-6)
        resultCos = cos(features[n[0]],features[n[1]])
            
        if resultCos.item() >= sim:
            out = th.tensor(1)
        else:
            out = th.tensor(0)
        
        indices.append(out)
        labels.append(n[2])
    return th.tensor(indices),th.tensor(labels)

def evaluate(model, g, features, mask,eval_sim):
    model.eval()
    with th.no_grad():
        embeddings = model.forward(g, features)
        indices , labels = resultSet_eval(embeddings,mask,eval_sim)
        correct = th.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

    
def write_to_file(text,id_file):
    MyFile=open(id_file+'.txt','w')
    MyFile.writelines(text)
    MyFile.close()
    
#print(print_experiment_loss())

In [6]:
def print_experiment_loss():
    v12,labels2 = resultSet_eval(embeddings.detach(),test_mask,0.7)
    total = len(labels2)
    return "acc= " + str(th.sum(v12 == labels2).item() / total)

## Define NN layers

In [7]:
class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g,feature):
        # Creating a local scope so that all the stored ndata and edata
        # (such as the `'h'` ndata below) are automatically popped out
        # when the scope exits.
        with g.local_scope():
            g.ndata['vector'] = feature
            g.update_all(gcn_msg, gcn_reduce)
            h = th.cat([feature,g.ndata['vector']],dim=1)
            g.ndata['vector'] = self.linear(h)
            return g.ndata['vector']
#             g.ndata['vector'] = feature
#             g.update_all(gcn_msg, gcn_reduce)
#             h = g.ndata['vector']
#             return self.linear(h)
        
# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         self.layer1 = GCNLayer(300, 150)
#         self.layer2 = GCNLayer(150, 100)
    
#     def forward(self, g,features):
#          #x = F.relu(self.layer1(g, features))
#         x = F.leaky_relu(self.layer1(g, features))
#         #x = F.relu(self.layer2(g, x))
#         x = self.layer2(g, x)
#         return x

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = GCNLayer(600, 300)
        self.layer2 = GCNLayer(600, 300)
        self.layer3 = nn.Linear(300, 300)
    
    def forward(self, g,features):
        x = F.leaky_relu(self.layer1(g,features))
        x = self.layer2(g, x)
        x = th.tanh(self.layer3(x))
        return x
    
    def forward_softmax(self, features):
        x = th.tanh(self.layer3(features))
        x = F.log_softmax(self.layer4(x),dim=1)
        return x
    

## Loss Functions

In [8]:
import torch.nn as nn
def get_loss_function(loss,param1=None,param2=None):
    if loss == "CosineEmbeddingLoss":
        red='sum'
        marg=0.0
        if param1 != None:
            reduction = param1
        if param2 != None:
            margin = param2
        return nn.CosineEmbeddingLoss(reduction=red,margin=marg)
    if loss == "CrossEntroppy":
        red='sum'
        marg=0.0
        if param1 != None:
            reduction = param1
        if param2 != None:
            margin = param2
        return nn.CosineEmbeddingLoss(reduction=red,margin=marg)
        
    

## Train Loss = cos_similarity

In [9]:
import time
import numpy as np

file_experiment = []

#hyperPar
eval_sim = 0.7
batch_splits = 1
lr=1e-3
loss_function = "CosineEmbeddingLoss"
margin=0.5
reduction='mean'
hyper_params = str("threshold_simmetry {:.4f} _ batch_split {:.4f}_learning_rate {:.4f} |\
loss_function {} + margin {:.4f} +reduction {}".format(eval_sim,batch_splits,lr,loss_function,margin,reduction))

#NN
exp_number = 0
net = Net()
optimizer = th.optim.Adam(net.parameters(),lr)
dur = []
loss_func = get_loss_function(loss_function)
train_batch = np.array_split(train_mask,batch_splits)

for epoch in range(1):
    t0 = time.time()
    net.train()
    for mask in train_batch:
        embeddings = net.forward(g, g.ndata['vector'])
        v1,v2,labels = resultSet_train(embeddings,mask)
        loss = loss_func(v1,v2, labels)
        optimizer.zero_grad()
        #loss.backward(retain_graph=True)
        loss.backward()
        optimizer.step()
    
    dur.append(time.time() - t0)
    acc = evaluate(net, g, g.ndata['vector'], test_mask,eval_sim)
    output = str("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
            epoch+5, loss.item(), acc, np.mean(dur)))
    print(output)
    file_experiment.append(output)

write_to_file(file_experiment,hyper_params + "_"+ str(exp_number))
exp_number+=1

Epoch 00005 | Loss 1594.2313 | Test Acc 0.4706 | Time(s) 2181.4156


## Train Loss = cos_similarity + cross_entroppy

In [None]:
import time
import numpy as np

file_experiment = []

#hyperPar
eval_sim = 0.7
batch_splits = 1
lr=1e-3
loss_function = "CosineEmbeddingLoss+CrossEntroppy"
margin=0.5
reduction='mean'
hyper_params = str("threshold_simmetry {:.4f} _ batch_split {:.4f}_learning_rate {:.4f} |\
loss_function {} + margin {:.4f} +reduction {}".format(eval_sim,batch_splits,lr,loss_function,margin,reduction))

#NN
exp_number = 0
net = Net()
optimizer = th.optim.Adam(net.parameters(),lr)
dur = []
loss_func = get_loss_function(loss_function)
train_batch = np.array_split(train_mask,batch_splits)

for epoch in range(1):
    t0 = time.time()
    net.train()
    for mask in train_batch:
        embeddings = net.forward(g, g.ndata['vector'])
        v1,v2,labels = resultSet_train(embeddings,mask)
        loss = loss_func(v1,v2, labels)
        optimizer.zero_grad()
        #loss.backward(retain_graph=True)
        loss.backward()
        optimizer.step()

    
    ##############################
#     v_softmax,labels_tipo = resultSet_train_softmax(embeddings,train_mask,g.ndata['tipo'])
#     embeddings_softmax = net.forward_softmax(v_softmax)
#     loss2 = F.nll_loss(embeddings_softmax, labels_tipo)
#     loss2.backward()
#     optimizer.step()
    
    
#     total_loss = loss2 + loss
#     total_loss.backward()
#     optimizer.step()
    ##############################
    
    
    
    dur.append(time.time() - t0)
    
    #acc = evaluate(net, g, embeddings.detach(), test_mask)
    acc = evaluate(net, g, g.ndata['vector'], test_mask,eval_sim)
    output = str("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
            epoch+5, loss.item(), acc, np.mean(dur)))
    print(output)
    file_experiment.append(output)

write_to_file(file_experiment,hyper_params + "_"+ str(exp_number))
exp_number+=1

## Tests

In [None]:
result,label = resultSet_eval(g.ndata['vector'],test_mask)

In [None]:
#result_sum = th.sum(1-(result - label))
#label
ds1_set = list(set(df_matching["'dataset1_id'"]))
ds2_set = list(set(df_matching["'dataset2_id'"]))
print(len(ds1_set))
print(len(ds2_set))

In [None]:
import torch as th
hola = th.tensor([True, True, False])
th.sum(hola)

In [None]:
cos = th.nn.CosineSimilarity(dim=0, eps=1e-6)
resultCos = cos(embeddings[576].detach(), embeddings[513].detach())

In [None]:
hola = list(filter(lambda x: x[2] == 1, test_mask))
print (len(hola)/len(test_mask))

In [None]:
cos = th.nn.CosineSimilarity(dim=0, eps=1e-6)
#output = cos(embeddings[495].detach(),embeddings[576].detach())
output = cos(embeddings[576].detach(),embeddings[574].detach())
output

In [None]:
#get data from a specific node
g.nodes[0].data
#get data from nodes
g.ndata
#another way of accessing data from a node
g.ndata['tipo'][0]

In [None]:
g.edges()

In [None]:
g.has_edge_between(374,17619)

In [None]:
g.ndata

In [None]:
for n in g.nodes:
    print (n.data['tipo'])