# Get dataset with ~80% train, ~20% test

In [1]:
import numpy as np
import pandas as pd
import step3_train_test_split as ds_split
file_name = "openml_203ds_datasets_matching.csv"

# Run the project's split step for this file, then load the two CSVs it is
# expected to leave under ./datasets/ (presumably written by split_ds --
# confirm against step3_train_test_split).
ds_split.split_ds(file_name)
split_prefix = "./datasets/"+file_name
train_mask = pd.read_csv(split_prefix+"_train.csv").to_numpy()
test_mask = pd.read_csv(split_prefix+"_test.csv").to_numpy()

# Report how many rows ended up in each split.
n_train = len(train_mask)
n_test = len(test_mask)
print("Train samples: "+str(n_train) + " Test samples: "+str(n_test))

Train/Test split done
Train samples: 1809 Test samples: 272


# Read graph of metafeatures

In [None]:
import networkx as nx
# Load the metafeature graph (fastText encoding; the BERT variant is kept
# below as a commented-out alternative).
# NOTE: gpickle files are Python pickles -- only load files you trust.
g_x = nx.read_gpickle("./word_embeddings/encoded_fasttext.gpickle")
#g_x = nx.read_gpickle("./word_embeddings/encoded_bert.gpickle")

# Integer code for each known value of the 'tipo' node attribute.
tipo_codes = {
    "dataset": 0,
    "feature dataset": 1,
    "literal dataset": 2,
    "attribute": 3,
    "feature attribute": 4,
    "literal attribute": 5,
}

# Walk the nodes in sorted order, replacing the textual node type with its
# integer code (unknown types are left untouched) and recording each node's
# position in that ordering as 'ds_order'.
for position, (_, attrs) in enumerate(sorted(g_x.nodes(data=True))):
    attrs['tipo'] = tipo_codes.get(attrs['tipo'], attrs['tipo'])
    attrs['ds_order'] = position

# Names and positions of the dataset nodes (tipo == 0), plus a name -> position map.
dataset_nodes = [(name, attrs) for name, attrs in g_x.nodes(data=True) if attrs['tipo'] == 0]
datasets = [name for name, _ in dataset_nodes]
ds_order = [attrs['ds_order'] for _, attrs in dataset_nodes]
map_ds = dict(zip(datasets, ds_order))
#map_reverse_ds_order = dict(zip(ds_order,datasets))
map_ds['DS_1']

# Rewrite the dataset ids in both masks (in place) to graph node positions.
# Only the training mask has its 0 labels turned into -1.
for row in train_mask:
    row[0] = map_ds["DS_"+str(row[0])]
    row[1] = map_ds["DS_"+str(row[1])]
    if row[2] == 0:
        row[2] = -1
for row in test_mask:
    row[0] = map_ds["DS_"+str(row[0])]
    row[1] = map_ds["DS_"+str(row[1])]

### Export graph to deep graph library

In [None]:
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
# Convert the networkx graph into a Deep Graph Library (DGL) graph, copying
# the 'tipo', 'vector' and 'ds_order' node attributes and no edge attributes.
g = dgl.DGLGraph()
g.from_networkx(g_x,node_attrs=['tipo','vector','ds_order'], edge_attrs=None)
# Drop this name's reference to the networkx graph; it is not referenced
# again in the cells that follow.
g_x = None

# Training

### Evaluation methods

In [None]:
# Accuracy based on thresholds of distance (e.g. cosine > 0.8 should be a positive pair)
def threshold_acc(model, g, features, mask, loss, dist_threshold=0.2, cos_threshold=0.8):
    """Return the fraction of pairs in ``mask`` that a fixed threshold classifies correctly.

    The model is called as ``model(g, features, mask[:, 0], mask[:, 1])`` and
    must return two batches of embeddings, one row per pair.

    Args:
        model: Callable producing the two embedding batches.
        g: Graph object, passed through to ``model`` untouched.
        features: Node features, passed through to ``model`` untouched.
        mask: 2-D array; columns 0 and 1 are the pair members handed to the
            model and column 2 is the label. A label equal to 1 is a positive
            pair; any other value (0 or -1) is treated as negative.
        loss: Name of the loss the model was trained with. It selects the
            metric: ``"ContrastiveLoss"`` / ``"Euclidean"`` use Euclidean
            distance, ``"CosineEmbeddingLoss"`` uses cosine similarity.
        dist_threshold: A pair is predicted positive when its Euclidean
            distance is ``<= dist_threshold``.
        cos_threshold: A pair is predicted positive when its cosine
            similarity is ``>= cos_threshold``.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``loss`` is not one of the supported names.
    """
    z1, z2 = model(g, features, mask[:, 0], mask[:, 1])

    if loss in ("ContrastiveLoss", "Euclidean"):
        # dist() | m - dist(): a small distance means the pair is similar.
        distances = th.nn.PairwiseDistance(p=2)(z1, z2)
        predicted_positive = distances <= dist_threshold
    elif loss == "CosineEmbeddingLoss":
        # 1 - cos() | max(0, cos() - m): a large similarity means the pair is similar.
        similarities = th.nn.CosineSimilarity(dim=1, eps=1e-6)(z1, z2)
        predicted_positive = similarities >= cos_threshold
    else:
        raise ValueError("Unsupported loss for threshold accuracy: " + repr(loss))

    # Compare as booleans so that both the 0/1 and the -1/1 label encodings
    # are scored correctly.
    actual_positive = th.as_tensor(mask[:, 2] == 1)
    correct = th.sum(predicted_positive.cpu() == actual_positive)
    return correct.item() * 1.0 / len(actual_positive)

# Accuracy based on nearest neighbor (e.g. the nearest node should be a positive pair)
def ne_ne_acc(model, g, features, mask, loss):
    """Return the share of positive-pair datasets whose nearest neighbour is a known match.

    For every dataset that appears in at least one positive pair (label 1),
    the closest other dataset embedding is looked up and counted as a hit
    when the two form a positive pair in either order.

    Note:
        ``mask`` is accepted for signature parity with ``threshold_acc`` but
        is not used. The pairs come from the module-level ``test_mask`` and
        ``train_mask``, and the candidate datasets from the module-level
        ``ds_order``.

    Args:
        model: Callable invoked as
            ``model(g, features, ds_order, ds_pos_samples_indices)`` and
            returning two embedding batches (all datasets, positive-pair
            datasets).
        g: Graph object, passed through to ``model``.
        features: Node features, passed through to ``model``.
        mask: Unused.
        loss: ``"ContrastiveLoss"`` / ``"Euclidean"`` rank by smallest
            Euclidean distance; ``"CosineEmbeddingLoss"`` ranks by largest
            cosine similarity.

    Returns:
        Hit ratio as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``loss`` is not one of the supported names.
    """
    # Validate the loss name up front; "Euclidean" ranks exactly like
    # "ContrastiveLoss", matching what threshold_acc accepts.
    if loss in ("ContrastiveLoss", "Euclidean"):
        largest = False
    elif loss == "CosineEmbeddingLoss":
        largest = True
    else:
        raise ValueError("Unsupported loss for nearest-neighbour accuracy: " + repr(loss))
    pdist = th.nn.PairwiseDistance(p=2)
    thecos = th.nn.CosineSimilarity(dim=1, eps=1e-6)

    total_data = np.concatenate((test_mask, train_mask))
    pos_samples = total_data[total_data[:, 2] == 1]
    # Get embeddings of datasets that are within the positive pairs.
    ds_concat = np.concatenate((pos_samples[:, 0], pos_samples[:, 1]))
    ds_pos_samples_indices = np.unique(ds_concat)
    ds_embeddings, ds_pos_samples = model(g, features, ds_order, ds_pos_samples_indices)

    # Store every positive pair in both orders so membership is one set lookup
    # instead of a scan over all pairs per candidate.
    positive_pairs = set()
    for first, second in pos_samples[:, :2]:
        positive_pairs.add((int(first), int(second)))
        positive_pairs.add((int(second), int(first)))

    sum_accuracy = 0
    for i, ds_index in enumerate(ds_pos_samples_indices):
        candidate = ds_pos_samples[i]
        if largest:
            # 1 - cos() | max(0, cos() - m)
            result = thecos(candidate.reshape(1, len(candidate)), ds_embeddings)
        else:
            # dist() | m - dist()
            result = pdist(candidate, ds_embeddings)

        # Take the second-best match: the best one is presumably the candidate
        # itself, since ds_order is expected to include it -- TODO confirm.
        result_index = th.topk(result, 2, largest=largest).indices[-1]
        closest_node_index = ds_order[result_index]

        if (int(ds_index), int(closest_node_index)) in positive_pairs:
            sum_accuracy += 1

    return sum_accuracy / len(ds_pos_samples_indices)

def evaluate(model, g, features, mask,loss):
    """Score ``model`` in eval mode with gradients disabled.

    Returns a ``(threshold accuracy, 1-NN accuracy)`` tuple, computed by
    ``threshold_acc`` and ``ne_ne_acc`` respectively with the same arguments.
    """
    model.eval()
    with th.no_grad():
        # Naive check: does each pair fall on the right side of a fixed threshold?
        threshold_score = threshold_acc(model, g, features, mask, loss)
        # Ranking check: is the nearest neighbour a known positive partner?
        neighbour_score = ne_ne_acc(model, g, features, mask, loss)
    return threshold_score, neighbour_score

### Train loop

In [None]:
import time
import numpy as np 
def train(training,iterations):
    """Run ``iterations`` epochs on ``training`` and persist the result.

    Each epoch optimises over the chunks of the module-level ``train_mask``,
    evaluates on the module-level ``test_mask``, appends a log record to
    ``training.log`` and prints it. ``training.save_state()`` is called once
    after the last epoch.

    Args:
        training: Experiment object exposing ``net``, ``loss``, ``optimizer``,
            ``batch_splits``, ``loss_name``, ``runtime_seconds``,
            ``epochs_run``, ``log`` and ``save_state()``.
        iterations: Number of epochs to run.
    """
    epoch_durations = []

    # training.batch_splits is the NUMBER of chunks to cut the training pairs
    # into, not the size of each chunk.
    batches = np.array_split(train_mask, training.batch_splits)

    # To pin the number of threads used for training:
    #th.set_num_threads(2)

    for _ in range(iterations):
        # Switch the network to training mode and start the epoch timer.
        training.net.train()
        started_at = time.time()
        running_loss = 0

        # One forward/backward/optimizer step per chunk.
        for batch in batches:
            left, right = training.net(g, g.ndata['vector'], batch[:, 0], batch[:, 1])
            batch_loss = training.loss(left, right, th.tensor(batch[:, 2]))
            training.optimizer.zero_grad()
            #batch_loss.backward(retain_graph=True)
            batch_loss.backward()
            training.optimizer.step()
            running_loss += batch_loss.item()

        mean_loss = running_loss / training.batch_splits

        # Time spent on this epoch, also accumulated on the experiment object.
        elapsed = time.time() - started_at
        epoch_durations.append(elapsed)
        training.runtime_seconds += elapsed

        # Accuracy on the test pairs.
        acc, acc2 = evaluate(training.net, g, g.ndata['vector'], test_mask, training.loss_name)

        # Log record, every number rounded to five decimals.
        record = {
            'epoch': training.epochs_run,
            'loss': float('%.5f' % mean_loss),
            'acc': float('%.5f' % acc),
            'acc2': float('%.5f' % acc2),
            'time_epoch': float('%.5f' % np.mean(epoch_durations)),
            'time_total': float('%.5f' % training.runtime_seconds),
        }
        training.log.append(record)
        training.epochs_run += 1
        print(str(record))

    # Write results and save the model to files.
    training.save_state()

### Config and run training
### NN architectures: 
{<br>
    "0": "Bert_300", <br>
    "1": "Bert_300_300_200", <br>
    "2": "Bert_768", <br>
    "3": "Fasttext_150", <br>
    "4": "Fasttext_150_150_100", <br>
    "5": "Fasttext_300" <br>
}
### Loss functions: 
{<br>
    "0": "ContrastiveLoss", <br>
    "1": "CosineEmbeddingLoss", <br>
    "2": "Euclidean" <br>
}
### Example to define architecture and loss
<b>import step3_gcn_nn_concatenate as gcn_nn</b> <br>
<b>import step3_gcn_loss as gcn_loss</b> <br>
print(gcn_nn.get_options()) #list of options<br>
print(gcn_loss.get_options()) #list of options<br>

### Load training class to save/load/train experiments:
<b>import step3_gcn_training as gcn_training</b>

In [None]:
import step3_gcn_nn_concatenate as gcn_nn
import step3_gcn_loss as gcn_loss
import step3_gcn_training as gcn_training

# #load model from path
# training = gcn_training.Training()
# training.load_state(path="./models/[file_name].pt")
# train(training,iterations=N)

# #train new model and specify parameters
# training = gcn_training.Training()
# training.set_training(
#             net_name= gcn_nn.get_option_name(),  # pass the index of the NN architecture option
#             batch_splits= , # number of sets to split the training data into (batch size = dataset size / batch_splits)
#             lr= , # learning rate for training (e.g. 1e-3)
#             loss_name=gcn_loss.get_option_name(),  # pass the index of the loss function option
#             loss_parameters=) # loss function parameters separated by '+', e.g. "0.0+mean" for cosine and contrastive
# train(training,iterations=N)

### Test suite

In [None]:
# Experiment grid: (loss option index, loss parameters), run in this order.
# Loss option 0 is the contrastive loss and option 1 the cosine loss; every
# run uses NN architecture option 3, 28 batch splits and a 1e-3 learning rate.
experiment_grid = [
    #Train with contrastive loss
    (0, "1.0+mean"),
    (0, "0.7+mean"),
    (0, "0.5+mean"),
    #Train with cosine loss
    (1, "0.0+mean"),
    (1, "0.3+mean"),
    (1, "0.5+mean"),
]

for loss_option, loss_parameters in experiment_grid:
    # Train a brand-new model for each configuration.
    training = gcn_training.Training()
    training.set_training(
                net_name= gcn_nn.get_option_name(3),
                batch_splits=28,
                lr=1e-3,
                loss_name=gcn_loss.get_option_name(loss_option),
                loss_parameters=loss_parameters)
    train(training,iterations=100)