In [15]:
trackdirpath = "/content/data_oaei/"
trackdirname = "all-v2/"

# Verbalization modes. The position of a name in the list below is the mode
# number stored in `verbalization_function`:
#   0 -> label only
#   1 -> classes (children, parents) and properties (domain, range) as a sequence
#   2 -> classes (children, parents) and properties (domain, range) with patterns
#   3 -> same as 2, with en-fr-es patterns
verbalization_function_name_list = [
    'label',
    'sequence',
    'pattern_en',
    'pattern_en-fr-es',
]
# Selected mode: 'pattern_en' (index 2).
verbalization_function = verbalization_function_name_list.index('pattern_en')

## 1. Mount Google Drive files and download multifarm dataset

> **Note**: Please change `data_dirname` to the path in your Drive where the `Beca colaboración (2021-22)_compartido/data_oaei/` directory is located.

> If you don't want to use the Google Drive data, you can skip the next cell and the dataset will be downloaded to the Colab session storage instead.

In [16]:
# Dataset folder, given relative to the Drive mount point used below
# (/content/drive). Edit this to match your own Drive layout.
data_dirname = '/MyDrive/Beca colaboración (2021-22)_compartido/data_oaei/'

# If /content/data_oaei/ is not already a directory, symlink the Drive folder
# into /content. `$data_dirname` is filled in by IPython from the Python
# variable above before the shell command runs.
![ ! -d '/content/data_oaei/' ] && ln -s "/content/drive$data_dirname" '/content'

In [17]:
!pip install rdflib --quiet
!pip install owlready2 --quiet
!pip install transformers --quiet
!pip install sentence-transformers --quiet

In [18]:
import os
from collections import defaultdict
import random
from itertools import chain
import time
import numpy as np
import math

In [19]:
import torch
from torch import nn, tensor
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [20]:
from transformers import AutoTokenizer, AutoModel

In [21]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

In [22]:
from collections import defaultdict
from datetime import datetime
import logging
import sys
# from rdflib import Graph, RDFS, URIRef
from owlready2 import get_ontology, IRIS, sync_reasoner
from sentence_transformers import SentenceTransformer, util
from torch.nn.functional import relu
# from transformers import AutoModel, AutoTokenizer  # TODO remove unnecesary deps
import numpy as np
from scipy.optimize import linear_sum_assignment

In [23]:
from owlready2 import get_ontology, IRIS
from rdflib import Graph

In [24]:
import json

## 3. Get alignments

In [25]:
def verbalize_label_sequence(init_label, list_neighbors_one, list_neighbors_two):
  """Append every neighbour label to `init_label`, separated by ", ".

  Labels from `list_neighbors_one` are appended first, then the ones from
  `list_neighbors_two`. A neighbour equal to `init_label` is left out.
  With no neighbours to add, `init_label` itself is returned.
  """
  text = init_label
  for neighbor in chain(list_neighbors_one, list_neighbors_two):
    if init_label == neighbor:
      continue
    text = text + ", " + neighbor
  return text


def verbalize_label_pattern_class(init_label, children, parents, language):
  """Verbalize a class label together with its children and parents.

  The result starts with `init_label`; then, separated by ", ", it lists
  "<child><pattern><init_label>" for each child and
  "<init_label><pattern><parent>" for each parent. Neighbours equal to
  `init_label` are skipped.

  The connecting pattern is " is a ", except when the global
  `verbalization_function` is 3: then "es" selects " es un " and "fr"
  selects " est un " (any other language keeps " is a ").
  """
  pattern = " is a "
  if verbalization_function == 3:
    # Multilingual mode: only Spanish and French have their own connector.
    pattern = {"es": " es un ", "fr": " est un "}.get(language, " is a ")

  text = init_label
  for child in children:
    if init_label == child:
      continue
    text = text + ", " + child + pattern + init_label
  for parent in parents:
    if init_label == parent:
      continue
    text = text + ", " + init_label + pattern + parent
  return text

def verbalize_label_pattern_property(init_label, domains, ranges, language):
  """Verbalize a property label together with its domains and ranges.

  The result starts with `init_label`; then, separated by ", ", it lists
  "<init_label> has domain <label>" for each domain and
  "<init_label> has range <label>" for each range. Labels equal to
  `init_label` are skipped. `language` is accepted but not used: the
  patterns are always the English ones.
  """
  text = init_label
  for connector, labels in ((" has domain ", domains), (" has range ", ranges)):
    for other in labels:
      if not init_label == other:
        text = text + ", " + init_label + connector + other
  return text

def _first_labels(entities):
  """Return the first label of every entity in `entities` that has one.

  Entities with an empty `label` list are skipped, and so are objects that
  raise AttributeError when their `label` is accessed.
  """
  labels = []
  for entity in entities:
    try:
      if len(entity.label) > 0:
        labels.append(entity.label[0])
    except AttributeError:
      continue
  return labels


def verbalize_neighbors(onto, iri, init_label, language, isClass=False, isProperty=False):
  """Verbalize the entity `iri` together with the labels of its neighbours.

  For a class (`isClass`) the neighbours are its children and parents, taken
  from `onto.get_children_of` / `onto.get_parents_of`. For a property
  (`isProperty`) they are its `domain` and `range`. The entity itself is
  looked up through the global `IRIS` mapping.

  The output format depends on the global `verbalization_function`:
    1     -> `verbalize_label_sequence`
    2, 3  -> `verbalize_label_pattern_class` or
             `verbalize_label_pattern_property`

  Args:
    onto: ontology object providing `get_parents_of` / `get_children_of`.
    iri: IRI of the entity to verbalize.
    init_label: label of the entity; the verbalization starts with it.
    language: language code passed on to the pattern verbalizers.
    isClass: treat the entity as a class.
    isProperty: treat the entity as a property (only checked when `isClass`
      is false).

  Returns:
    The verbalization. When neither flag is set, or the mode is not one of
    1, 2, 3, there is nothing to add and `init_label` is returned as is
    (these cases used to fail on an unassigned local variable).
  """
  # Start from empty neighbour lists so every path below has defined values.
  list_neighbors_one = []
  list_neighbors_two = []

  if isClass:
    parents = _first_labels(onto.get_parents_of(IRIS[iri]))
    children = _first_labels(onto.get_children_of(IRIS[iri]))
    list_neighbors_one = children
    list_neighbors_two = parents
  elif isProperty:
    list_neighbors_one = _first_labels(IRIS[iri].domain)
    list_neighbors_two = _first_labels(IRIS[iri].range)

  if verbalization_function == 1:
    return verbalize_label_sequence(
        init_label, list_neighbors_one, list_neighbors_two)

  if verbalization_function == 2 or verbalization_function == 3:
    if isClass:
      return verbalize_label_pattern_class(
          init_label, list_neighbors_one, list_neighbors_two, language)
    if isProperty:
      return verbalize_label_pattern_property(
          init_label, list_neighbors_one, list_neighbors_two, language)

  # No applicable verbalizer: fall back to the bare label.
  return init_label


def get_iri_label_lists(onto, generator, isClass=False, isProperty=False):
  """Map the IRI of every labelled entity in `generator` to its verbalization.

  Entities with no label are skipped. For the others the first label is
  used. When the global `verbalization_function` is greater than 0 the label
  is expanded through `verbalize_neighbors`; otherwise the label itself is
  stored.

  Args:
    onto: ontology the entities belong to (passed on to `verbalize_neighbors`).
    generator: iterable of entities exposing `label` and `iri`.
    isClass: the entities are classes.
    isProperty: the entities are properties.

  Returns:
    A `defaultdict(str)` from IRI to verbalization; looking up an IRI that
    was not stored yields "".
  """
  iri_label_dict = defaultdict(str)

  for item in generator:
    if len(item.label) < 1:
      continue
    label = item.label[0]
    # A label without a language tag is a plain `str` with no `.lang`
    # attribute; treat its language as unknown instead of raising.
    language = getattr(label, "lang", None)
    iri = item.iri

    if verbalization_function > 0:
      verbalization = verbalize_neighbors(
          onto, iri, label, language, isClass, isProperty)
    else:
      verbalization = label

    iri_label_dict[iri] = verbalization

  return iri_label_dict


def mappings_dictionary_default_value():
    """Build a fresh blank mapping record: empty entity texts and a 0.0 measure."""
    blank_mapping = dict.fromkeys(("alignmententity1", "alignmententity2"), "")
    blank_mapping["alignmentmeasure"] = 0.0
    return blank_mapping

In [26]:
# Build one training example per reference mapping of every alignment
# directory: the verbalized source entity, the verbalized target entity and
# the mapping measure.
total_examples_list = []

# List the track directory once so the progress total and the loop agree.
alignment_dirs = os.listdir(trackdirpath+trackdirname)
total_len = len(alignment_dirs)

for i, alignmentdir in enumerate(alignment_dirs):

  fileprefix = "file://" + trackdirpath + trackdirname + alignmentdir

  source_url = fileprefix + "/source.rdf"
  target_url = fileprefix + "/target.rdf"
  reference_url = fileprefix + "/reference.rdf"

  source_onto = get_ontology(source_url).load()
  target_onto = get_ontology(target_url).load()

  source_class_iri_label_dict = get_iri_label_lists(
      source_onto, source_onto.classes(), isClass=True)

  target_class_iri_label_dict = get_iri_label_lists(
      target_onto, target_onto.classes(), isClass=True)

  source_properties_iri_label_dict = get_iri_label_lists(
      source_onto, source_onto.properties(), isProperty=True)

  target_properties_iri_label_dict = get_iri_label_lists(
      target_onto, target_onto.properties(), isProperty=True)

  # Combine class and property verbalizations into one IRI -> text mapping
  # per ontology.
  source_class_iri_label_dict.update(source_properties_iri_label_dict)
  source_iri_label_dict = source_class_iri_label_dict
  target_class_iri_label_dict.update(target_properties_iri_label_dict)
  target_iri_label_dict = target_class_iri_label_dict

  reference_graph = Graph()
  reference_graph.parse(reference_url)
  triples = list(reference_graph)

  # Find the node typed ".../alignmentAlignment" and drop the triples it is
  # the subject of; what remains describes the individual mappings.
  # str() is used instead of toPython() so typed literals (which toPython()
  # may turn into non-string values) cannot break the `.split` call.
  root_node_bnode = next(
      (subj for subj, pred, obj in triples
       if str(obj).split('/')[-1] == "alignmentAlignment"),
      None)
  if root_node_bnode is None:
    raise ValueError(f"No alignmentAlignment root node found in {reference_url}")
  triples = [triple for triple in triples if triple[0] != root_node_bnode]

  # One record per subject node, filled in triple by triple.
  equivalences_dictionary = defaultdict(mappings_dictionary_default_value)

  for subj, pred, obj in triples:
    tuple_rel = pred.toPython().split('/')[-1]
    if tuple_rel == "alignmententity1":
      # IRIs missing from the label mapping yield "".
      tuple_value = source_iri_label_dict[obj.toPython()]
    elif tuple_rel == "alignmententity2":
      tuple_value = target_iri_label_dict[obj.toPython()]
    elif tuple_rel == "alignmentmeasure":
      tuple_value = float(obj.toPython())
    else:
      continue
    equivalences_dictionary[subj.toPython()][tuple_rel] = tuple_value

  # extend() appends in place; `list + list` re-copied the whole accumulated
  # list on every directory.
  total_examples_list.extend(
      {'texts': [mapping['alignmententity1'], mapping['alignmententity2']],
       'label': mapping['alignmentmeasure']}
      for mapping in equivalences_dictionary.values())

  print(f'{i}/{total_len}')

0/1125
1/1125
2/1125
3/1125
4/1125
5/1125
6/1125
7/1125
8/1125
9/1125
10/1125
11/1125
12/1125
13/1125
14/1125
15/1125
16/1125
17/1125
18/1125
19/1125
20/1125
21/1125
22/1125
23/1125
24/1125
25/1125
26/1125
27/1125
28/1125
29/1125
30/1125
31/1125
32/1125
33/1125
34/1125
35/1125
36/1125
37/1125
38/1125
39/1125
40/1125
41/1125
42/1125
43/1125
44/1125
45/1125
46/1125
47/1125
48/1125
49/1125
50/1125
51/1125
52/1125
53/1125
54/1125
55/1125
56/1125
57/1125
58/1125
59/1125
60/1125
61/1125
62/1125
63/1125
64/1125
65/1125
66/1125
67/1125
68/1125
69/1125
70/1125
71/1125
72/1125
73/1125
74/1125
75/1125
76/1125
77/1125
78/1125
79/1125
80/1125
81/1125
82/1125
83/1125
84/1125
85/1125
86/1125
87/1125
88/1125
89/1125
90/1125
91/1125
92/1125
93/1125
94/1125
95/1125
96/1125
97/1125
98/1125
99/1125
100/1125
101/1125
102/1125
103/1125
104/1125
105/1125
106/1125
107/1125
108/1125
109/1125
110/1125
111/1125
112/1125
113/1125
114/1125
115/1125
116/1125
117/1125
118/1125
119/1125
120/1125
121/1125
122/1125
123

In [27]:
# Preview the path of the JSON file the examples will be written to.
# (The stray `, "w"` copied from the open() call made this print a tuple.)
print(f'filename: {trackdirpath+trackdirname[:-1]+"_total_examples_"+verbalization_function_name_list[verbalization_function]+".json"}')

filename: ('/content/data_oaei/all-v2_total_examples_pattern_en.json', 'w')


In [28]:
# Keep only the alignments whose two texts are both non-empty and report how
# many were dropped.
total_examples_list_non_empty = [
    align for align in total_examples_list
    if not (align['texts'][0] == "" or align['texts'][1] == "")
]
count = len(total_examples_list) - len(total_examples_list_non_empty)

print(f"removed {count} alignments")

# Write the surviving examples as JSON next to the track directory.
output_path = trackdirpath+trackdirname[:-1]+'_total_examples_'+verbalization_function_name_list[verbalization_function]+".json"
with open(output_path, "w") as infile:
  json.dump(total_examples_list_non_empty, infile)

len(total_examples_list_non_empty)

removed 269 alignments


35127