In [24]:
# Where the OAEI data lives: base directory plus the track sub-directory.
# Both values keep their trailing slash; later cells strip it with [:-1].
trackdirpath = '/content/data_oaei/'
trackdirname = 'all-v2/'

# Names of the available verbalization strategies, listed by index.
verbalization_function_name_list = [
    'label',             # 0. Label
    'sequence',          # 1. Verbalize classes (children, parents) and properties (domain, range) (with sequence)
    'pattern_en',        # 2. Verbalize classes (children, parents) and properties (domain, range) (with pattern)
    'pattern_en-fr-es',  # 3. Verbalize classes (children, parents) and properties (domain, range) (with pattern en-fr-es)
]

# Index into the list above; 2 selects 'pattern_en'.
verbalization_function = 2

## 1. Mount Google Drive files and download multifarm dataset

> **Note**: Please change `data_dirname` to the path in your Drive where the `Beca colaboración (2021-22)_compartido/data_oaei/` directory is located.

> If you don't want to use Google Drive data, you can always omit the execution of the next cell and the dataset will be downloaded to the Colab session storage.

In [25]:
# Dataset folder, expressed relative to /content/drive (the shell line below
# prefixes it with that path).
# NOTE(review): assumes Google Drive is already mounted at /content/drive; the
# mount call is not part of this section.
data_dirname = '/MyDrive/Beca colaboración (2021-22)_compartido/data_oaei/'

# Only when /content/data_oaei/ is not already a directory: symlink the Drive
# folder into /content. `$data_dirname` is filled in by IPython from the Python
# variable above before the command reaches the shell.
![ ! -d '/content/data_oaei/' ] && ln -s "/content/drive$data_dirname" '/content'

In [26]:
# Models folder, expressed relative to /content/drive (the shell line below
# prefixes it with that path).
# NOTE(review): assumes Google Drive is already mounted at /content/drive; the
# mount call is not part of this section.
models_dirname = '/MyDrive/Beca colaboración (2021-22)_compartido/models/'

# Only when /content/models/ is not already a directory: symlink the Drive
# folder into /content. `$models_dirname` is filled in by IPython from the
# Python variable above before the command reaches the shell.
![ ! -d '/content/models/' ] && ln -s "/content/drive$models_dirname" '/content'

## 2. Install and import libraries

In [27]:
!pip install rdflib --quiet
!pip install owlready2 --quiet
!pip install transformers --quiet
!pip install sentence-transformers --quiet

In [28]:
import os
from collections import defaultdict
import random
from itertools import chain
import time
import numpy as np
import math

In [29]:
import torch
from torch import nn, tensor
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [30]:
from transformers import AutoTokenizer, AutoModel

In [31]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator

In [32]:
from collections import defaultdict
from datetime import datetime
import logging
import sys
# from rdflib import Graph, RDFS, URIRef
from owlready2 import get_ontology, IRIS, sync_reasoner
from sentence_transformers import SentenceTransformer, util
from torch.nn.functional import relu
# from transformers import AutoModel, AutoTokenizer  # TODO remove unnecesary deps
import numpy as np
from scipy.optimize import linear_sum_assignment

In [33]:
from owlready2 import get_ontology, IRIS
from rdflib import Graph

In [34]:
import json

## 3. Finetune

In [35]:
# Show which JSON file the examples are read from.
# The previous f-string expression ended with `, "w"` (left over from an
# `open(...)` call), which turned it into a tuple and printed
# "('<path>', 'w')" instead of the path itself.
examples_filename = trackdirpath + trackdirname[:-1] + "_total_examples_" + verbalization_function_name_list[verbalization_function] + ".json"
print(f'filename: {examples_filename}')

filename: ('/content/data_oaei/all-v2_total_examples_pattern_en.json', 'w')


In [36]:
# Load the example pairs produced for the selected track and verbalization.
# The path is <trackdirpath><track name without trailing slash>_total_examples_<verbalization>.json
examples_path = trackdirpath + trackdirname[:-1] + "_total_examples_" + verbalization_function_name_list[verbalization_function] + ".json"

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, so non-ASCII characters in the texts load the same everywhere.
with open(examples_path, "r", encoding="utf-8") as json_file:
  positive_examples_list = json.load(json_file)

In [37]:
# Number of loaded pairs, and number of negatives to synthesize.
# Both are equal, which is the "50-50" balance recorded in the save path.
positive_len = len(positive_examples_list)
negative_len = positive_len

# Hugging Face id of the pretrained model to fine-tune.
modelname = "sentence-transformers/distiluse-base-multilingual-cased-v2"

# Output directory for the fine-tuned model. The model part of the name is
# derived from `modelname` ('/' replaced by '_') instead of being typed a second
# time, so changing `modelname` can no longer leave a stale path behind.
# For the current settings the resulting string is unchanged.
model_save_path = (
    'models/model_' + modelname.replace('/', '_')
    + '_50-50_' + trackdirname[:-1]
    + '_' + verbalization_function_name_list[verbalization_function]
    + "_oaei_final"
)

print(model_save_path)

# Training hyper-parameters.
num_epochs = 3
train_batch_size = 64

models/model_sentence-transformers_distiluse-base-multilingual-cased-v2_50-50_all-v2_pattern_en_oaei_final


In [38]:
# Build the fine-tuning set: every loaded pair plus `negative_len` synthetic
# negative pairs, shuffled together.
# NOTE(review): assumes every entry of `positive_examples_list` is a matching
# pair (its stored label is reused as-is) — confirm against the JSON producer.

# Dedicated, seeded generator so the sampled negatives and the shuffle order are
# reproducible on a re-run without touching the global `random` state.
SAMPLING_SEED = 42
sample_rng = random.Random(SAMPLING_SEED)

total_samples = []

print(f"positive_len: {positive_len}")
print(f"negative_len: {negative_len}")

# Add positives
total_samples.extend(
    InputExample(texts=sample['texts'], label=sample['label'])
    for sample in positive_examples_list
)

# Add negatives: pair the first text of one entry with the second text of a
# DIFFERENT entry. Drawing the two indices independently (as before) could pick
# the same entry twice, which labelled a true pair 0.0.
# Entries at different indices that carry identical texts are not filtered out.
if negative_len > 0 and positive_len < 2:
  raise ValueError("At least two examples are required to build negative pairs")

for _ in range(negative_len):
  # sample() without replacement guarantees source_idx != target_idx.
  source_idx, target_idx = sample_rng.sample(range(positive_len), 2)
  total_samples.append(InputExample(
      texts=[positive_examples_list[source_idx]['texts'][0],
             positive_examples_list[target_idx]['texts'][1]],
      label=0.0))

sample_rng.shuffle(total_samples)

total_len = len(total_samples)
print(f"total_len:    {total_len}")

positive_len: 35127
negative_len: 35127
total_len:    70254


In [39]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [40]:
# Load the pretrained sentence-embedding model that will be fine-tuned.
model = SentenceTransformer(model_name_or_path=modelname)

In [41]:
# Hold out the first 30% (rounded up) of the already-shuffled samples for
# testing; everything after that index is used for training.
split_idx = math.ceil(len(total_samples) * 0.3)
test_samples, train_samples = total_samples[:split_idx], total_samples[split_idx:]

In [42]:
# Build a binary-classification evaluator over the held-out samples and run it
# on the model before any fine-tuning has happened, as a baseline.
# The call is the cell's last expression, so its return value is what the cell
# displays.
pre_test_evaluator = BinaryClassificationEvaluator.from_input_examples(test_samples, name='pre-test')
pre_test_evaluator(model)

0.9633378978940054

In [43]:
# Training objective: cosine-similarity loss on the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model=model)

# Batches of training samples, reshuffled by the loader.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

In [44]:
# Warm up over the first 10% (rounded up) of all training steps, where the
# total is batches per epoch times number of epochs.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
print(f"Warmup-steps: {warmup_steps}")

Warmup-steps: 231


In [45]:
# Fine-tune the model with the cosine-similarity objective for `num_epochs`
# epochs, running the evaluator every 100 steps and writing to `model_save_path`.
# NOTE(review): `pre_test_evaluator` was built from `test_samples`, so the test
# split is consulted during training. If the library keeps the best-scoring
# checkpoint at `output_path` (believed to be the default — confirm), the score
# later reported on the same split is optimistic. Consider a separate
# validation split for this evaluator.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          evaluator=pre_test_evaluator,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/769 [00:00<?, ?it/s]

Iteration:   0%|          | 0/769 [00:00<?, ?it/s]

Iteration:   0%|          | 0/769 [00:00<?, ?it/s]

In [46]:
# Reload the model from the directory that training wrote to, replacing the
# in-memory `model`.
model = SentenceTransformer(model_save_path)
# Score the reloaded model on the same held-out samples used for the baseline.
test_evaluator = BinaryClassificationEvaluator.from_input_examples(test_samples, name='post-test')
# Last expression of the cell: the returned score is what the cell displays.
# `output_path` presumably makes the evaluator write its results under the
# model directory — confirm.
test_evaluator(model, output_path=model_save_path)

0.9888774235268183