In [None]:
import sys
import os

# Directory of the local `main` package, appended to sys.path below.
# os.path.join picks the separator of the host OS; the previous hard-coded
# '\\main\\' suffix only produced a valid path on Windows. The trailing ''
# keeps the trailing separator that the original string had.
cwd = os.path.join(os.getcwd(), 'main', '')
# Guard against duplicate entries when the cell is executed more than once.
if cwd not in sys.path:
    sys.path.append(cwd)
print(cwd)

In [None]:
import pickle
from time import time
from sklearn.metrics.pairwise import cosine_distances
from main import siamese_word_model_neg_sampling_distillation_loss
from main import ppdb_utils
from main import utils
from main import params
from main.tree import addOOVwords
from main import evaluate
from main import marriage
import numpy as np
import lasagne
import theano
import theano.tensor as T

###### Load the Configuration Parameters:

We initialize the Out-of-Vocabulary (OOV) words with values sampled from a normal distribution, $X \sim \mathcal{N}(\mu,\,\sigma^{2})$, using $\mu=0$ and `sigma=0.01` as passed to `addOOVwords` below.

Please set the input of the load_from_yaml method to:
1. 'data/ma-nci/ma-nci_params.yaml' for performing ontology matching between the **Adult Mouse Anatomical Dictionary** and the **NCI Thesaurus**.
2. 'data/fma-nci/fma-nci_params.yaml' for performing ontology matching between the **Foundation Model of Anatomy** ontology and the **NCI Thesaurus**.
3. 'data/fma-snomed/fma-snomed_params.yaml' for performing ontology matching between the **Foundation Model of Anatomy** ontology and **SNOMED CT**.

In [None]:
# Import the configuration module under an alias: the name `params` is rebound
# to the configuration *object* below, so calling `params.params()` directly
# would fail the second time this cell is executed.
from main import params as params_module

params = params_module.params()
params.load_from_yaml('data/fma-snomed/fma-snomed_params.yaml')

# `words` and `We` are returned together by utils.getWordmap for the configured word file.
(words, We) = utils.getWordmap(params.wordfile)
print('The number of words that existed in the pre-trained words vectors trained on PubMed and PMC is: %d' % (len(words)))

# Read the training data and add the OOV words in the dictionary.
examples = utils.getDataset(params.train, words)
We = addOOVwords(examples, words, We, mean=0, sigma=0.01)

# Use a single batch that spans the whole training set.
params.batchsize = len(examples)

###### Print the resulting Configuration Parameters:

In [None]:
# Show the configuration object after loading the YAML file and overriding the batch size.
print(params)

###### Initialize the Phrase Retrofitting Component:

In [None]:
# Build the phrase retrofitting model from `We` (returned by utils.getWordmap and
# extended by addOOVwords) and the loaded configuration.
model = siamese_word_model_neg_sampling_distillation_loss.ppdb_word_model(We, params)

###### Extract possible examples of 'descriptive associated' terms:

In [None]:
# Term collections of the two ontologies, taken from the loaded configuration.
terms_of_ontology_1 = params.terms_of_ontology_1
terms_of_ontology_2 = params.terms_of_ontology_2
# Extract the antonym candidates of each ontology and combine them with `+`.
# NOTE(review): the structure of the returned items is defined in main.utils — confirm.
ontology_1_ants = utils.getAntonyms(terms_of_ontology_1, words)
ontology_2_ants = utils.getAntonyms(terms_of_ontology_2, words)
ants = ontology_1_ants + ontology_2_ants
# The return value is discarded, so the call is used for its side effects only
# (presumably it attaches model representations to the items of `ants` — confirm).
utils.getAntRepresentations(model, ants)
# Build the synonym set from the training examples and process it the same way.
syns = utils.createSet(examples, words)
utils.getAntRepresentations(model, syns)

###### Train the Phrase Retrofitting Component:

In [None]:
# Train the model on the example pairs, passing the synonym and antonym collections built above.
# NOTE(review): the meaning of `start=2` is defined by ppdb_utils.train — confirm before changing it.
ppdb_utils.train(model, examples, words, params, synonyms=syns, antonyms=ants, start=2)

###### Run (1) the <cite>[McVitie et al.][1]</cite> algorithm for solving the Stable Marriage Assignment problem and (2) display the resulting performance:


[1]:https://link.springer.com/article/10.1007/BF01934199

In [None]:
from time import time

# Only the alignment step is timed; the evaluation that follows is not included.
start_time = time()
alignments = marriage.ontology_alignment(
    model,
    terms_of_ontology_1,
    terms_of_ontology_2,
    words,
    ceil=0.2,
)
end_time = time()
elapsed = end_time - start_time
print("Total matching time:", elapsed)

# Compare the produced alignments with the reference alignments from the configuration.
results = marriage.alignment_evaluation(
    model, words, alignments, params.ground_truth_alignments
)

###### Preprocess the synonymy data so as to feed them to the <cite>[Denoising Autoencoder][1]</cite>

Note: We preprocess the data as described in <cite>[LeCun et al.][2]</cite>


[1]:https://dl.acm.org/citation.cfm?id=1390294
[2]:http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf

In [None]:
from main.utils import prepare_data

# Collect the embeddings of the first and of the second phrase of every training pair.
left = []
for example in examples:
    example[0].populate_embeddings(words)
    left.append(example[0].embeddings)
right = []
for example in examples:
    example[1].populate_embeddings(words)
    right.append(example[1].embeddings)

X1, M1 = prepare_data(left)
X2, M2 = prepare_data(right)

# Data preprocessing
# Run both sides through the trained model to obtain the phrase representations.
embg1 = model.feedforward_function(X1, M1)
embg2 = model.feedforward_function(X2, M2)
# Centre every row around zero.
embg1 = embg1 - embg1.mean(axis=1, keepdims=True)
embg2 = embg2 - embg2.mean(axis=1, keepdims=True)

# Scale each matrix by its own norm. np.linalg.norm without `axis` returns a single
# scalar for the whole array, so this is one global factor per matrix, not per row.
embg1 = embg1 / np.linalg.norm(embg1)
# Fixed: this line previously divided `embg1` again, which replaced the targets
# with a rescaled copy of the origins.
embg2 = embg2 / np.linalg.norm(embg2)
# End of Data preprocessing

# As the number of training data is not huge, we extract only the 1% of them for checking the validation loss.
train_len = int(len(embg1) * 0.99)
train_origins = embg1[:train_len]
train_targets = embg2[:train_len]
test_origins = embg1[train_len:]
test_targets = embg2[train_len:]

# Pair every side with itself and with its counterpart:
# (origin -> origin), (origin -> target), (target -> origin), (target -> target).
train_all1 = np.concatenate((train_origins, train_origins, train_targets, train_targets), axis=0)
train_all2 = np.concatenate((train_origins, train_targets, train_origins, train_targets), axis=0)

###### Initialize the Denoising Autoencoder (DAE):

In [None]:
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras import regularizers

# Width of the bottleneck: 200 input floats are compressed to 32 (a factor of 6.25).
encoding_dim = 32

# Placeholder for a single 200-dimensional input vector.
input_img = Input(shape=(200,))

# Denoising step: a Dropout layer with rate `v` is applied directly to the input.
v = 0.4
dropped_input = Dropout(v)(input_img)

# "encoded" is the encoded representation of the (corrupted) input:
# one ReLU layer with an L1 activity penalty (10e-05 == 1e-4).
sparsity_penalty = regularizers.l1(10e-05)
encoded = Dense(
    encoding_dim,
    activation='relu',
    activity_regularizer=sparsity_penalty,
)(dropped_input)

# "decoded" maps the 32-dimensional code back to 200 dimensions.
decoded = Dense(200, activation='sigmoid')(encoded)

# Full model: input -> reconstruction.
autoencoder = Model(input_img, decoded)

###### Define the DAE's loss and the optimization method:

In [None]:
# This model maps an input to its encoded representation; it shares its layers
# with `autoencoder`, so it reflects whatever the autoencoder learns.
encoder = Model(input_img, encoded)

# Create a placeholder for an encoded (32-dimensional) input.
encoded_input = Input(shape=(encoding_dim,))
# Retrieve the last layer of the autoencoder model (the Dense layer that produces `decoded`).
decoder_layer = autoencoder.layers[-1]
# Create the decoder model: encoded input -> reconstruction.
decoder = Model(encoded_input, decoder_layer(encoded_input))

# Configure training: Adadelta optimizer with a binary cross-entropy reconstruction loss.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

###### Train the DAE:

In [None]:
# Callback from main.evaluate; its overall_training_time() is reported after fitting.
time_callback = evaluate.TimeHistory()
# Fit the autoencoder with `train_all1` as inputs and `train_all2` as reconstruction
# targets; the held-out split (test_origins -> test_targets) is used for validation.
autoencoder.fit(train_all1, train_all2,
                epochs=15,
                batch_size=256,
                shuffle=True,
                validation_data=(test_origins, test_targets),
                callbacks=[time_callback])
print("DAE's total training time:", (time_callback.overall_training_time()))

###### Define some helpful functions so as to get the output of the DAE:

In [None]:
def sen2em(model, sent, words):
    """Return the element-wise mean of the embeddings produced for `sent`.

    `evaluate.sen2Embgs(model, sent, words)` is expected to return a mapping whose
    values are 200-dimensional vectors; they are summed into a 200-float
    accumulator and divided by the number of entries in the mapping.
    """
    embeddings = evaluate.sen2Embgs(model, sent, words)
    total = np.zeros(200)
    for vector in embeddings.values():
        total += vector
    return total / len(embeddings)

def ae_sim(model, string, words):
    """Cosine similarity of two terms in the encoder's latent space.

    `string` must have the form "<term 1>, <term 2>": it is split on ', ' into
    exactly two parts. Each part is averaged into one 200-dimensional vector by
    `sen2em`, encoded with the notebook-level `encoder`, and the cosine similarity
    of the two codes is returned with singleton dimensions squeezed away.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    first_term, second_term = string.split(', ')
    # Compute both sentence vectors first, then encode them, as separate steps.
    vectors = [sen2em(model, term, words) for term in (first_term, second_term)]
    code_1, code_2 = [encoder.predict(vec.reshape(1, 200)) for vec in vectors]
    return np.squeeze(cosine_similarity(code_1, code_2))

###### Run the Outlier Detection Component based on the DAE:

In [None]:
outlier_threshold = 0.2
detected_outliers = []
cnt = 0
for alignment in results[0]:
    # Keep the text before ') -> ' and drop its first character
    # (entries presumably look like "(term 1, term 2) -> ..." — confirm).
    pair_text = alignment.split(') -> ')[0][1:]
    # Distance in the encoder's latent space: 1 - cosine similarity.
    ae_distance = 1 - ae_sim(model, pair_text, words)
    if ae_distance >= outlier_threshold:
        detected_outliers.append(alignment)
        cnt += 1
        continue
    if ae_distance < outlier_threshold:
        # Alignments that pass the check are printed together with their distance.
        print(alignment + ' ||| ' + str(ae_distance))
print('The number of detected outliers is: %d' % (cnt))

###### Compute the number of correctly detected misalignments and the number of correct alignments that were marked wrongly as misalignments:

In [None]:
# `wrong`: detected outliers whose text contains the ANSI reset sequence '\x1b[0m'.
# `correct`: all remaining detected outliers.
wrong = sum(1 for entry in detected_outliers if '\x1b[0m' in entry)
correct = len(detected_outliers) - wrong

###### Display the resulting performance after the application of the DAE-based Outlier Detection Component:

In [None]:
print('The DAE outlier detector discovered correctly %d misalignments' % (wrong))
print('However, the DAE outlier detector also confused %d true alignments as misalignments' % (correct))

# NOTE(review): results[3] / results[4] are presumably the numbers of correct / wrong
# alignments before filtering and results[2] the number of reference alignments —
# confirm against marriage.alignment_evaluation.
kept_correct = results[3] - correct
kept_wrong = results[4] - wrong
kept_total = kept_correct + kept_wrong

# Each ratio falls back to 0.0 when its denominator is zero (e.g. every remaining
# alignment was removed) instead of raising ZeroDivisionError.
precision = (1.0 * kept_correct) / kept_total if kept_total else 0.0
recall = (1.0 * kept_correct) / results[2] if results[2] else 0.0
f1_score = (2.0 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

print('The new precision is: %f' % (precision))
print('The new recall    is: %f' % (recall))
print('The new F1-score  is  %f' % (f1_score))