In [1]:
#python -u OWL2Vec_Plus.py --walker wl --walk_depth 4 --URI_Doc yes --Lit_Doc yes --Embed_Out_URI no --Embed_Out_Words yes


In [2]:
import re
import sys
import random
import gensim
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle as pk
import multiprocessing
from pathlib import Path
from nltk import word_tokenize
from owl2vec_star.lib.Evaluator import Evaluator
from owl2vec_star.lib.RDF2Vec_Embed import get_rdf2vec_walks

In [3]:
class AttributeDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` is equivalent to ``d["key"]`` for both lookup and assignment.
    """

    def __getattr__(self, attr):
        # __getattr__ is only consulted after normal attribute lookup fails.
        # Unknown names must surface as AttributeError (not KeyError) so that
        # hasattr() and getattr(obj, name, default) behave as documented.
        try:
            return self[attr]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            ) from None

    def __setattr__(self, attr, value):
        # Attribute assignment stores the value as a dictionary item.
        self[attr] = value

# Experiment configuration
ROOT_DIR = Path.cwd().parent / "data" / "owl2vec"

# Seed Python's global RNG once, up front: later cells call random.randint
# (mixed-document construction) and random.shuffle (corpus shuffling), and
# without a seed those steps differ on every run.
SEED = 42
random.seed(SEED)

FLAGS = AttributeDict()

# Input files
FLAGS['onto_file'] = str(ROOT_DIR / "foodon-merged.train.owl")
FLAGS['train_file'] = ROOT_DIR / "train.csv"
FLAGS['valid_file'] = ROOT_DIR / "valid.csv"
FLAGS['test_file'] = ROOT_DIR / "test.csv"
FLAGS['class_file'] = ROOT_DIR / "classes.txt"
FLAGS['inferred_ancestor_file'] = ROOT_DIR / "inferred_ancestors.txt"
FLAGS["embedsize"] = 100

# Which documents are built ("yes"/"no") and which vectors embed() outputs
FLAGS["URI_Doc"] ="yes"
FLAGS["Lit_Doc"] ="yes"
FLAGS["Mix_Doc"] ="yes"
FLAGS["Mix_Type"] ="random"
FLAGS["Embed_Out_URI"] ="yes"
FLAGS["Embed_Out_Words"] ="yes"

# Walk extraction settings and pre-extracted axiom / annotation files
FLAGS["input_type"] ="concatenate"
FLAGS["walk_depth"] = 4
FLAGS["walker"] ="wl"
FLAGS["axiom_file"] =ROOT_DIR / 'axioms.txt'
FLAGS["annotation_file"] =ROOT_DIR / 'annotations.txt'

# Additional setting: where the computed embeddings are saved
EMB_PATH = Path.cwd() / "save_owl2vec_weights" / "owl2vec3.pkl"



# Read the class identifiers, one per line. The context manager closes the
# file handle, and the encoding matches the one used for the annotation file.
with open(FLAGS.class_file, encoding="utf8") as class_fh:
    classes = [line.strip() for line in class_fh]
candidate_num = len(classes)

In [4]:

def URI_parse(uri):
    """Turn a URI into a list of lowercase alphabetic name tokens.

    The leading ``http...#`` prefix is removed, the characters ``_ - . / " '``
    are treated as spaces, and every remaining chunk is split at camel-case
    boundaries. Pieces that are not purely alphabetic are discarded.
    """
    camel_piece = re.compile('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)')
    separators = str.maketrans({ch: ' ' for ch in '_-./"\''})

    local_name = re.sub("http[a-zA-Z0-9:/._-]+#", "", uri)
    chunks = local_name.translate(separators).split()
    return [
        piece.group(0).lower()
        for chunk in chunks
        for piece in camel_piece.finditer(chunk)
        if piece.group(0).isalpha()
    ]


def embed(model, instances):
    """Build one feature vector per instance from a trained embedding model.

    Reads the module-level ``FLAGS`` and ``uri_label``. Depending on
    ``FLAGS.Embed_Out_URI`` / ``FLAGS.Embed_Out_Words`` (each "yes" or "no",
    case-insensitive) the vector for an instance is its URI vector, the mean
    of the vectors of its label words, or the concatenation of both (URI
    part first).

    Args:
        model: object exposing ``vector_size`` and ``wv`` (with
            ``key_to_index`` and ``get_vector``), e.g. a gensim Word2Vec.
        instances: iterable of instance keys.

    Returns:
        A list with one vector per instance, in input order. Keys or words
        missing from the vocabulary contribute a zero vector.

    Exits:
        Prints a message and calls ``sys.exit(0)`` when the flag combination
        is not one of the three supported ones.
    """
    # key_to_index is a dict, so each membership test is a hash lookup;
    # testing against the index_to_key list scans the whole vocabulary.
    vocab = model.wv.key_to_index
    dim = model.vector_size
    # The flags do not change during the call, so parse them once.
    out_words = FLAGS.Embed_Out_Words.lower()
    out_uri = FLAGS.Embed_Out_URI.lower()

    def uri_embedding(inst):
        """Vector stored for the instance key itself, or zeros if unknown."""
        return model.wv.get_vector(inst) if inst in vocab else np.zeros(dim)

    def word_embedding(inst):
        """Mean vector of the instance's in-vocabulary label words, or zeros."""
        total = np.zeros(dim)
        known = 0
        for word in uri_label.get(inst, ()):
            if word in vocab:
                total += model.wv.get_vector(word)
                known += 1
        return total / known if known > 0 else total

    feature_vectors = []
    for instance in instances:
        if out_words == 'yes' and out_uri == 'yes':
            feature_vectors.append(
                np.concatenate((uri_embedding(instance), word_embedding(instance))))

        elif out_words == 'no' and out_uri == 'yes':
            feature_vectors.append(uri_embedding(instance))

        elif out_words == 'yes' and out_uri == 'no':
            feature_vectors.append(word_embedding(instance))

        else:
            # NOTE(review): exit status 0 on an error path is kept for
            # backward compatibility; consider raising ValueError instead.
            print("Unknown embed out type")
            sys.exit(0)

    return feature_vectors


def pre_process_words(words):
    """Normalise raw tokens into lowercase alphabetic words.

    URLs are stripped from every input token, the remainder is joined with
    spaces and tokenised with ``word_tokenize``; only purely alphabetic
    tokens are kept, lowercased.
    """
    url_free = []
    for raw in words:
        url_free.append(re.sub(r'https?:\/\/.*[\r\n]*', '', raw, flags=re.MULTILINE))

    cleaned = []
    for token in word_tokenize(' '.join(url_free)):
        if token.isalpha():
            cleaned.append(token.lower())
    return cleaned


# Extract the corpus and learn the embeddings

In [5]:

# Split the annotation file into rdfs:label entries (uri_label) and all other
# annotations whose subject is one of the target classes (annotations).
uri_label = dict()
annotations = list()
# A set makes the per-line class test a hash lookup instead of a list scan.
class_set = set(classes)
with open(FLAGS.annotation_file, encoding="utf8") as annotation_fh:
    for line in annotation_fh:
        fields = line.strip().split()
        if len(fields) < 2:
            # Blank or truncated line: there is no predicate column to inspect.
            continue
        if fields[1] == 'http://www.w3.org/2000/01/rdf-schema#label':
            uri_label[fields[0]] = pre_process_words(fields[2:])
        elif fields[0] in class_set:
            annotations.append(fields)

In [6]:

# URI document: graph walks over the ontology plus the pre-extracted axioms,
# each represented as a list of string tokens.
walk_sentences, axiom_sentences = list(), list()
if FLAGS.URI_Doc.lower() == 'yes':
    walks_ = get_rdf2vec_walks(onto_file=FLAGS.onto_file, walker_type=FLAGS.walker,
                               walk_depth=FLAGS.walk_depth, classes=classes)
    print('Extracted {} walks for {} classes!'.format(len(walks_), len(classes)))
    walk_sentences += [list(map(str, x)) for x in walks_]
    # The context manager closes the axiom file; one whitespace-split
    # sentence is produced per line.
    with open(FLAGS.axiom_file, encoding="utf8") as axiom_fh:
        axiom_sentences += [line.strip().split() for line in axiom_fh]
    print('Extracted %d axiom sentences' % len(axiom_sentences))
URI_Doc = walk_sentences + axiom_sentences

Extracted 2218855 walks for 28182 classes!
Extracted 34184 axiom sentences


In [7]:
# Literal document: annotation text plus the walk/axiom sentences with every
# labelled URI replaced by its label words.
Lit_Doc = list()
if FLAGS.Lit_Doc.lower() == 'yes':
    for annotation in annotations:
        processed_words = pre_process_words(annotation[2:])
        if len(processed_words) > 0:
            # A class can have annotations but no rdfs:label entry; use no
            # label words in that case instead of failing with KeyError.
            label_words = uri_label.get(annotation[0], [])
            Lit_Doc.append(label_words + processed_words)
    print('Extracted %d literal annotations' % len(Lit_Doc))

    for sentence in walk_sentences:
        lit_sentence = list()
        for item in sentence:
            if item in uri_label:
                lit_sentence += uri_label[item]
            elif item.startswith('http://www.w3.org') and '#' in item:
                # Keep only the fragment after '#'. URIs without a fragment
                # fall through to the generic branch instead of raising
                # IndexError.
                lit_sentence += [item.split('#')[1].lower()]
            else:
                lit_sentence += [item]
        Lit_Doc.append(lit_sentence)

    for sentence in axiom_sentences:
        lit_sentence = list()
        for item in sentence:
            lit_sentence += uri_label[item] if item in uri_label else [item.lower()]
        Lit_Doc.append(lit_sentence)

Extracted 52249 literal annotations


In [8]:

def _walk_item_words(item):
    """Words for one walk item: its label words, the lowercased '#'-fragment
    of a URI starting with http://www.w3.org, or the item itself."""
    if item in uri_label:
        return uri_label[item]
    # URIs without a fragment fall through instead of raising IndexError.
    if item.startswith('http://www.w3.org') and '#' in item:
        return [item.split('#')[1].lower()]
    return [item]


def _axiom_item_words(item):
    """Words for one axiom item: its label words, or the lowercased item."""
    return uri_label[item] if item in uri_label else [item.lower()]


def _mix_sentence(sentence, keep_index, item_words):
    """Copy of `sentence` where only position `keep_index` stays a raw item
    and every other item is replaced by `item_words(item)`."""
    mixed = list()
    for i, item in enumerate(sentence):
        mixed += [item] if i == keep_index else item_words(item)
    return mixed


def _mix_variants(sentence, mix_type, item_words):
    """Mixed sentences for one input sentence.

    'all' yields one variant per position (that position kept raw);
    'random' yields a single variant with one randomly chosen raw position.
    Any other mix type, or an empty sentence, yields nothing.
    """
    if mix_type == 'all':
        return [_mix_sentence(sentence, index, item_words) for index in range(len(sentence))]
    if mix_type == 'random':
        if not sentence:
            # random.randint(0, -1) would raise ValueError on an empty sentence.
            return []
        keep = random.randint(0, len(sentence) - 1)
        return [_mix_sentence(sentence, keep, item_words)]
    return []


# Mixed document: sentences in which one item stays a URI while the others
# are replaced by words.
Mix_Doc = list()
if FLAGS.Mix_Doc.lower() == 'yes':
    mix_type = FLAGS.Mix_Type.lower()
    for sentence in walk_sentences:
        Mix_Doc += _mix_variants(sentence, mix_type, _walk_item_words)
    # In 'all' mode axiom sentences now keep each position in turn, exactly
    # like walk sentences (previously a random position was kept on every
    # iteration of the index loop).
    for sentence in axiom_sentences:
        Mix_Doc += _mix_variants(sentence, mix_type, _axiom_item_words)

In [9]:

# Report the size of each document, then merge them into a single corpus and
# shuffle the sentence order in place.
doc_sizes = (len(URI_Doc), len(Lit_Doc), len(Mix_Doc))
print('URI_Doc: %d, Lit_Doc: %d, Mix_Doc: %d' % doc_sizes)
all_doc = [*URI_Doc, *Lit_Doc, *Mix_Doc]
random.shuffle(all_doc)

URI_Doc: 2253039, Lit_Doc: 2305288, Mix_Doc: 2253039


In [10]:
# Train Word2Vec on the combined corpus.
# NOTE(review): per the gensim documentation, `seed` alone does not make a
# multi-worker run fully deterministic - confirm whether that matters here.
model_ = gensim.models.Word2Vec(
    sentences=all_doc,
    vector_size=FLAGS.embedsize,
    window=5,
    workers=multiprocessing.cpu_count(),
    sg=1,          # skip-gram
    epochs=10,
    negative=25,   # number of negative samples
    min_count=1,   # keep every token, however rare
    seed=42,
)

In [11]:
# One feature vector per entry of `classes`, in the same order.
classes_e = embed(model_, classes)

In [12]:
# Refuse to overwrite embeddings saved by an earlier run.
if EMB_PATH.exists():
    raise FileExistsError(f"file {EMB_PATH} already exist, consider changing name to not confuse after")

# The target directory may not exist yet; create it so open() does not fail
# with FileNotFoundError after the embeddings have been computed.
EMB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Store the class embeddings together with the configuration that produced them.
save_obj = dict(
    classes_e = classes_e,
    model_config = dict(FLAGS)
)
# "xb" is exclusive creation: if the file appeared after the check above,
# open() raises FileExistsError instead of silently overwriting it.
with open(EMB_PATH, "xb") as file:
    pk.dump(save_obj, file)
    