In [10]:
import sys
sys.path.append('../src/')
from tools import get_features
from tools import get_results
from tools import generate_ids_paths
from tools import columns_to_features
from tools import create_graphs
from tools import create_feature_tensor
from train import negative_sampling_3
from train import train_model
from train import predict_all_links
from train import compute_confusion_matrix
from train import metrics
from model import CasanovaModel
import pandas as pd

import torch
import dgl
import json
import random
import numpy as np
import itertools
import re

Using backend: pytorch


# Dataset Preparation and Profiling

In [2]:
# Folder holding the input datasets, relative to this notebook's location.
# generate_ids_paths (from tools) returns two objects, unpacked below.
# NOTE(review): presumably cols_to_ids maps column keys to integer ids and
# files_to_paths maps dataset files to their paths — confirm in tools.py.
path_to_datasets = "../datasets/opendata_small/" # testing the method on the opendata-small configuration
cols_to_ids, files_to_paths = generate_ids_paths(path_to_datasets)

## Profiling

In [31]:
# Profiling step: hand the dataset files to columns_to_features (from tools).
# NOTE(review): the structure of cols_features is defined in tools.py and is
# not visible here; presumably one feature vector per column — confirm.
cols_features = columns_to_features(files_to_paths)

In [32]:
# Combine the profiled features with the column ids into a single tensor-like
# object; it is converted with .tolist() when the graphs are built further down.
# NOTE(review): row ordering presumably follows the ids in cols_to_ids — confirm in tools.py.
feature_tensor = create_feature_tensor(cols_features, cols_to_ids)

## Group datasets per category

In [11]:
cols = list(cols_to_ids.keys())

# Group table names by category. Element 0 of each key in cols_to_ids is
# treated as the table name, and a table name is expected to look like
# "<category>_<number>"; the category is everything before the trailing number.
category_tables = dict()

for col in cols:
    table_name = col[0]
    match = re.match(r"([a-z_]+)_([0-9]+)", table_name, re.I)

    # re.match returns None when the name does not fit the pattern; fail with
    # a clear message instead of an opaque "NoneType is not subscriptable".
    if match is None:
        raise ValueError(
            "Table name %r does not follow the '<category>_<number>' convention" % (table_name,)
        )

    # Collect into a set so that a table shared by many columns is kept once.
    category_tables.setdefault(match[1], set()).add(table_name)

# Convert to sorted lists: set iteration order is not stable across kernel
# restarts, which would make the later random sampling irreproducible even
# with a fixed seed.
category_tables = {
    category: sorted(tables) for category, tables in category_tables.items()
}

# Construct relatedness graphs

## For each category, specify the number of datasets to consider

In [13]:
# Number of tables to sample per category, in the iteration order of
# category_tables (e.g., for the large datasets, we would pick [10,20,30,..,100]).
datasets_to_regard = [2,2,2,4,4]

# Validate before touching category_tables so a bad configuration cannot leave
# it half-sampled.
if len(datasets_to_regard) < len(category_tables):
    raise ValueError(
        "datasets_to_regard has %d entries but there are %d categories"
        % (len(datasets_to_regard), len(category_tables))
    )

for (c, tables), n_tables in zip(category_tables.items(), datasets_to_regard):
    if n_tables > len(tables):
        raise ValueError(
            "Category %r has only %d tables, cannot sample %d"
            % (c, len(tables), n_tables)
        )

# NOTE(review): no random seed is set in this notebook, so the sample (and every
# downstream result) changes between runs; consider seeding `random` up front.
# Replacing the value of an existing key while iterating does not resize the
# dict, so updating category_tables inside the loop is safe.
for (c, tables), n_tables in zip(category_tables.items(), datasets_to_regard):
    category_tables[c] = random.sample(tables, n_tables)

In [14]:
# Show, for every category, its name followed by the number of sampled tables.
# The name printed must be this loop's own key; the previous code printed a
# variable left over from an earlier cell, repeating one category name.
for category, tables in category_tables.items():
    print(category)
    print(len(tables))

miller_bus_stops
2
miller_accidents
2
miller_train_stations
2
miller_applicants
4
miller_timetable
4


## Get ground truth of matches among different datasets

In [15]:
# Each non-empty line of the ground-truth file holds one match as two
# space-separated fields; only the first two fields of a line are used.
ground_truth_filepath = '../ground_truth/matches_opendata.txt'
with open(ground_truth_filepath, 'r') as fp:
    ground_truth = [
        (fields[0], fields[1].rstrip())
        for fields in (
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # would otherwise fail with IndexError on fields[1].
            line.split(' ') for line in fp if line.strip()
        )
    ]

## Construct relatedness graphs (configure silos)

In [16]:
# Build the relatedness graphs from the sampled tables, the column ids, the
# feature tensor (as a plain list) and the ground-truth matches. The call also
# returns three column lookups; all_columns and all_cols_ids are passed to
# predict_all_links further down.
# NOTE(review): the two literal `2` arguments are undocumented here — presumably
# one of them is the number of graphs (silos) to create; confirm both against
# tools.create_graphs and consider naming them as constants.
graphs, all_columns, all_cols_ids, all_ids_cols = create_graphs(category_tables, cols_to_ids, 2, feature_tensor.tolist(),ground_truth, 2)

Graph 1 receives datasets from source miller_bus_stops
Graph 1 receives datasets from source miller_accidents
Graph 1 receives datasets from source miller_train_stations


## Get negative samples for each relatedness graph (as a negative graph)

In [17]:
# One negative graph per relatedness graph, keyed by the same integer index,
# built with negative sampling strategy #3 from the paper.
graphs_neg = {
    graph_idx: negative_sampling_3(graphs[graph_idx])
    for graph_idx in range(len(graphs))
}

# Train model

In [21]:
# Train on the positive and negative graphs, one relatedness graph after another.
# NOTE(review): 150 is the per-graph epoch count (see inline comment); the
# meaning of 300 is not visible here — presumably a layer/embedding size,
# confirm in train.train_model.
model = train_model(graphs, graphs_neg, 150, 300, incremental=True) # train incrementally for 150 epochs per relatedness graph

In epoch 0, loss 0.7187286615371704
In epoch 10, loss 0.27573084831237793
In epoch 20, loss 0.23369793593883514
In epoch 30, loss 0.2230553776025772
In epoch 40, loss 0.2133490890264511
In epoch 50, loss 0.20865681767463684
In epoch 60, loss 0.2059348076581955
In epoch 70, loss 0.20067444443702698
In epoch 80, loss 0.18997253477573395
In epoch 90, loss 0.1781836897134781
In epoch 100, loss 0.1747325211763382
In epoch 110, loss 0.1729383021593094
In epoch 120, loss 0.17084525525569916
In epoch 130, loss 0.16755078732967377
In epoch 140, loss 0.15936782956123352
In epoch 0, loss 0.12378218024969101
In epoch 10, loss 0.1082797646522522
In epoch 20, loss 0.10329711437225342
In epoch 30, loss 0.10078881680965424
In epoch 40, loss 0.09652882814407349
In epoch 50, loss 0.08491155505180359
In epoch 60, loss 0.07701005786657333
In epoch 70, loss 0.0668984204530716
In epoch 80, loss 0.05570992827415466
In epoch 90, loss 0.04667947441339493
In epoch 100, loss 0.04176035150885582
In epoch 110, los

## Apply the model to all nodes in order to embed them

In [22]:
# Run the trained GNN over each graph's node features ('feat') and keep the
# resulting node embeddings, detached from the autograd graph, per graph index.
embeddings = {
    graph_idx: model.gnn(graphs[graph_idx], graphs[graph_idx].ndata['feat']).detach()
    for graph_idx in range(len(graphs))
}

In [23]:
# Score links using the embeddings and compare against the ground truth.
# The captured output shows predictions between graph pairs, followed by
# precision, recall and F-score.
predict_all_links(all_columns, all_cols_ids, embeddings, ground_truth, model, len(graphs))

Computing predictions between graphs: 0 - 1
Precision: 0.7518796992481203
Recall: 0.6369426751592356
F-score: 0.689655172413793


# Baseline results

## Get results for a SotA matching method (EmbDI in this example)

In [24]:
# Load the stored baseline output from its JSON file; the second argument
# names the method whose results are read.
# NOTE(review): the structure of baseline_results is defined by tools.get_results — not visible here.
baseline_results_filepath = '../baseline_results/EmbDI_opendata_small_results.json'
baseline_results = get_results(baseline_results_filepath, 'EmbDI')

## Compute effectiveness results

In [30]:
# Compare the baseline's matches with the ground truth. The confusion matrix
# comes back as four values; the third is discarded (presumably the
# true-negative count — confirm in train.compute_confusion_matrix).
# metrics() then reports precision, recall and F-score, as the output shows.
count_tp, count_fp, _, count_fn = compute_confusion_matrix(baseline_results, ground_truth)
metrics(count_tp, count_fp, count_fn)

Precision: 0.03615643190948594
Recall: 0.9
F-score: 0.06951998108299835
