In [1]:
from helper_classes import PYKE
from helper_classes import Parser
from helper_classes import DataAnalyser
from helper_classes import PPMI

import util as ut
import numpy as np

In [2]:
random_state = 1
np.random.seed(random_state)

# Learning DBpedia Embedings with PYKE

1. Download drugbank.nq.gz from http://download.bio2rdf.org/#/release/4/drugbank/
2. Extract drugbank.nq and locate the file under KGs/Drugbank.

In [3]:
# DEFINE MODEL PARAMS
K = 45
num_of_dims = 50
bound_on_iter = 30
omega = 0.45557
e_release = 0.0414

In [4]:
kg_root = 'KGs/Drugbank'
kg_path = kg_root + '/'

In [5]:
# As Drugbank is serializedin N-Quads format
ut.triple=4

In [8]:
storage_path, experiment_folder = ut.create_experiment_folder()

parser = Parser(p_folder=storage_path, k=K)

parser.set_similarity_measure(PPMI)

model = PYKE()

analyser = DataAnalyser(p_folder=storage_path)


# For the illustration purpusoes lets consider only the first 1000 ntriples
#holder = parser.pipeline_of_preprocessing(kg_path,bound=1000)

holder = parser.pipeline_of_preprocessing(kg_path)




###### Preprocessing  starts ######


###### Constructing Inverted Index  starts ######
Number of RDF triples: 3146309
Number of vocabulary terms:  521363
Number of subjects:  421121
Constructing Inverted Index  took  42.67512631416321  seconds



###### Calculation of PPMIs  starts ######
Calculation of PPMIs  took  113.38293743133545  seconds

Preprocessing  took  156.2215337753296  seconds



In [None]:
# Note that the original data contains many literals. 
# Hence 7 seconds of overhead stems from reading literals

In [9]:
vocab_size = len(holder)

embeddings = ut.randomly_initialize_embedding_space(vocab_size, num_of_dims)

learned_embeddings = model.pipeline_of_learning_embeddings(e=embeddings,
                                                           max_iteration=bound_on_iter,
                                                           energy_release_at_epoch=e_release,
                                                           holder=holder, omega=omega)



###### Generating Embeddings:  starts ######
EPOCH:  0
EPOCH:  1
EPOCH:  2
EPOCH:  3
EPOCH:  4
EPOCH:  5
EPOCH:  6
EPOCH:  7
EPOCH:  8
EPOCH:  9
EPOCH:  10
EPOCH:  11
EPOCH:  12
EPOCH:  13
EPOCH:  14
EPOCH:  15
EPOCH:  16
EPOCH:  17
EPOCH:  18
EPOCH:  19
EPOCH:  20
EPOCH:  21
EPOCH:  22
EPOCH:  23
EPOCH:  24

 Epoch:  24
System energy: -0.03499999999999983
Generating Embeddings:  took  1334.2774987220764  seconds



In [12]:
minutes=(1334.2774987220764 + 156.2215337753296)/60

print('Total runtime ',minutes) # rounded 25 minutes

Total runtime  24.841650541623434


In [13]:
# To use memory efficiently
del holder
del embeddings

In [14]:
analyser.perform_type_prediction(learned_embeddings)



###### Type Prediction  starts ######
K values: [1, 3, 5, 10, 15, 30, 50, 100]
##### 1 ####
Mean type prediction [0.96962103]
##### 3 ####
Mean type prediction [0.69831727]
##### 5 ####
Mean type prediction [0.62199829]
##### 10 ####
Mean type prediction [0.56098671]
##### 15 ####
Mean type prediction [0.48592903]
##### 30 ####
Mean type prediction [0.37220401]
##### 50 ####
Mean type prediction [0.27262127]
##### 100 ####
Mean type prediction [0.21259995]
Type Prediction  took  7003.54514169693  seconds



# To see what happens under the hood


```python
@performance_debugger('Type Prediction')
def perform_type_prediction(self, df):

    def create_binary_type_vector(t_types, a_types):
        vector = np.zeros(len(all_types))
        i = [a_types.index(_) for _ in t_types]
        vector[i] = 1
        return vector

    def create_binary_type_prediction_vector(t_types, a_types):
        vector = np.zeros(len(all_types))
        i = [a_types.index(_) for _ in itertools.chain.from_iterable(t_types)]
        vector[i] += 1
        return vector

    # get the types. Mapping from the index of subject to the index of object
    type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

    # get the index of objects / get type information =>>> s #type o
    all_types = sorted(set.union(*list(type_info.values())))


    # Consider only points with type infos.
    e_w_types = df.loc[list(type_info.keys())]

    neigh = NearestNeighbors(n_neighbors=101, algorithm='kd_tree', metric='euclidean', n_jobs=-1).fit(
        e_w_types)

    # Get similarity results for selected entities
    df_most_similars = pd.DataFrame(neigh.kneighbors(e_w_types, return_distance=False))

    # As sklearn implementation of kneighbors returns the point itself as most similar point
    df_most_similars.drop(columns=[0], inplace=True)


    # If kNN operProbabily no need to mapping as we calculated for all
    mapper = dict(zip(list(range(len(e_w_types))), e_w_types.index.values))
    # The values of most similars are mapped to original vocabulary positions
    df_most_similars = df_most_similars.applymap(lambda x: mapper[x])


    k_values = [1, 3, 5, 10, 15, 30, 50, 100]

    print('K values:',k_values)
    for k in k_values:
        print('#####', k, '####')
        similarities = list()
        for _, S in df_most_similars.iterrows():
            true_types = type_info[S.values[0]]
            type_predictions = [type_info[_] for _ in S.values[1:k + 1]]

            vector_true = create_binary_type_vector(true_types, all_types)
            vector_prediction = create_binary_type_prediction_vector(type_predictions, all_types)

            sim = cosine(vector_true, vector_prediction)
            similarities.append(1 - sim)

        report = pd.DataFrame(similarities)
        print('Mean type prediction', report.mean().values)
            
```