# Import Libraries

In [1]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import random
import pickle

import edge2vec.transition3 as transitions
import edge2vec.edge2vec3 as edge2vec

# Set Seed

In [2]:
# Passed as `seed` to the edge2vec walk simulations and to Word2Vec further down.
# A truthy value also adds the '_seeded' marker to the output folder name.
seed = None

# Set Parameters

Specify which dataset is used.
*   1 (Original knowledge graph)
*   2 (Restructured knowledge graph)

In [3]:
dataset_nr = 2
# `dataset_nr == 1 or 2` parses as `(dataset_nr == 1) or 2`, which is always truthy,
# so the check has to be a membership test to actually reject other values.
assert dataset_nr in (1, 2), f'dataset_nr must be 1 or 2, got {dataset_nr!r}'

# Possible variations of the knowledge graph / node features
NO_GENE_PRODUCTS = 'no gene products'
SINGLE_RELATION_TYPE = 'only one relation type'
RANDOM_FEATURES = 'random node embedding values'
concept_changes = SINGLE_RELATION_TYPE

embedding_method = 'e2v'

# File/folder name suffix belonging to the selected variation (empty when none applies)
suffix_per_concept_change = {
    NO_GENE_PRODUCTS: '_nogeneprods',
    SINGLE_RELATION_TYPE: '_singlerel',
    RANDOM_FEATURES: '_randomfeat',
}
suffix = suffix_per_concept_change.get(concept_changes, '')

# Marker in the output folder name for runs with a fixed seed.
# Note: this is a truthiness test, so seed = 0 is treated like "no seed" here.
fixed_emb = '_seeded' if seed else ''

# Set Folder

In [4]:
# All results are stored below '<current working directory>/output'
curr_working_dir = os.getcwd()
curr_output_dir = os.path.join(curr_working_dir, 'output')
dataset_output_dir = os.path.join(curr_output_dir, f'g{dataset_nr}_{embedding_method}{fixed_emb}{suffix}')

if os.path.isdir(dataset_output_dir):
    print(f'Output folder for dataset {dataset_nr} already exists and will be used: {dataset_output_dir}')
else:
    # makedirs (unlike mkdir) also creates the parent 'output' folder when it is missing
    os.makedirs(dataset_output_dir, exist_ok=True)
    print(f'Output folder for dataset {dataset_nr} is created: {dataset_output_dir}')

Output folder for dataset 2 already exists and will be used: C:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g2_e2v_singlerel


In [5]:
def create_run_folder(foldername):
    """Create a new run folder inside the dataset output folder.

    Uses the module-level `dataset_output_dir` as parent folder and
    `dataset_nr` for the status message.

    Args:
        foldername: Name of the folder to create.

    Returns:
        The path of the newly created folder.

    Raises:
        FileExistsError: Raised by `os.mkdir` when the folder already exists.
    """
    new_run_dir = os.path.join(dataset_output_dir, foldername)
    os.mkdir(new_run_dir)
    print(f'Output folder for current run on dataset {dataset_nr} is created: {new_run_dir}')
    return new_run_dir

# Collect the existing run folders. Only names of the exact form 'run_<number>' count,
# so unrelated folders that merely contain 'run' cannot break the number parsing.
run_folders_list = []
run_numbers = []

for item in os.listdir(dataset_output_dir):
    curr_path = os.path.join(dataset_output_dir, item)
    prefix, separator, run_nr_str = item.partition('_')
    if prefix == 'run' and separator and run_nr_str.isdecimal() and os.path.isdir(curr_path):
        run_folders_list.append(item)
        run_numbers.append(int(run_nr_str))

# Take the maximum over the integer run numbers rather than sorting the folder names:
# as strings 'run_1000' sorts before 'run_999'. Without earlier runs, start at run 1.
next_run_nr = max(run_numbers, default=0) + 1
next_run_folder = 'run_{:03d}'.format(next_run_nr)
output_path = create_run_folder(next_run_folder)

Output folder for current run on dataset 2 is created: C:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\g2_e2v_singlerel\run_010


# Load Nodes and Edges

In [6]:
# Indexed node table that belongs to the selected dataset and graph variation
nodes_csv_path = f'output/indexed_nodes_{dataset_nr}{suffix}.csv'
nodes = pd.read_csv(nodes_csv_path)
nodes

Unnamed: 0,index_id,id,semantic,label,semantic_id
0,0,MP:0004187,phenotype,cardia bifida,9
1,1,ZP:0100138,phenotype,muscle tendon junction myotome increased amoun...,9
2,2,MGI:1346525,gene,Sgcd,5
3,3,OMIM:300377.0044,variant,"DMD, LYS770TER",11
4,4,ZP:0002210,phenotype,posterior lateral line neuromast primordium mi...,9
...,...,...,...,...,...
10270,10270,ZP:0014934,phenotype,atrioventricular valve development process qua...,9
10271,10271,ENSEMBL:ENSCAFG00000011207,gene,ENSEMBL:ENSCAFG00000011207,5
10272,10272,ENSEMBL:ENSXETG00000039922,gene,ENSEMBL:ENSXETG00000039922,5
10273,10273,ENSEMBL:ENSACAG00000010058,gene,ENSEMBL:ENSACAG00000010058,5


In [7]:
# Indexed edge table that belongs to the selected dataset and graph variation
edges_csv_path = f'output/indexed_edges_{dataset_nr}{suffix}.csv'
edges = pd.read_csv(edges_csv_path)
edges

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,WormBase:WBGene00006787,unc-52,5,304,edge,WormBase:WBGene00003929,pat-2,5,1542,0
1,WormBase:WBGene00006787,unc-52,5,304,edge,WormBase:WBGene00006789,unc-54,5,6544,0
2,WormBase:WBGene00006787,unc-52,5,304,edge,ENSEMBL:ENSSSCG00000015555,LAMC1,5,9268,0
3,WormBase:WBGene00006787,unc-52,5,304,edge,ZFIN:ZDB-GENE-021226-3,lamc1,5,5387,0
4,WormBase:WBGene00006787,unc-52,5,304,edge,ENSEMBL:ENSOANG00000001050,ENSEMBL:ENSOANG00000001050,5,2204,0
...,...,...,...,...,...,...,...,...,...,...
85987,458,scopolamine butylbromide,4,5945,edge,P11229,Muscarinic acetylcholine receptor M1,6,5919,0
85988,OMIM:300377.0080,"DMD, IVS62, A-G, -285",11,1578,edge,HGNC:2928,DMD,5,3310,0
85989,5297,dacomitinib,4,8798,edge,P12931,Proto-oncogene tyrosine-protein kinase Src,6,2379,0
85990,ClinVarVariant:981988,NC_000023.11:g.(31875374_31929595)_(31968515_3...,11,8189,edge,HGNC:2928,DMD,5,3310,0


In [8]:
# Show the row of a single node, selected by its index id
inspected_index_id = 1253
nodes[nodes['index_id'] == inspected_index_id]

Unnamed: 0,index_id,id,semantic,label,semantic_id
1253,1253,1920,drug,nicotine,4


In [9]:
# Work on a separate copy so the loaded edge table itself stays untouched
edge2vec_df = edges.copy(deep=True)
edge2vec_df.head(n=10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,WormBase:WBGene00006787,unc-52,5,304,edge,WormBase:WBGene00003929,pat-2,5,1542,0
1,WormBase:WBGene00006787,unc-52,5,304,edge,WormBase:WBGene00006789,unc-54,5,6544,0
2,WormBase:WBGene00006787,unc-52,5,304,edge,ENSEMBL:ENSSSCG00000015555,LAMC1,5,9268,0
3,WormBase:WBGene00006787,unc-52,5,304,edge,ZFIN:ZDB-GENE-021226-3,lamc1,5,5387,0
4,WormBase:WBGene00006787,unc-52,5,304,edge,ENSEMBL:ENSOANG00000001050,ENSEMBL:ENSOANG00000001050,5,2204,0
5,WormBase:WBGene00006787,unc-52,5,304,edge,WBPhenotype:0001171,shortened life span,9,5475,0
6,WormBase:WBGene00006787,unc-52,5,304,edge,WBPhenotype:0001425,receptor mediated endocytosis defective,9,3057,0
7,WormBase:WBGene00006787,unc-52,5,304,edge,WBPhenotype:0000781,body wall muscle thin filament variant,9,7821,0
8,WormBase:WBGene00006787,unc-52,5,304,edge,ENSEMBL:ENSPTRG00000021480,ENSEMBL:ENSPTRG00000021480,5,6347,0
9,WormBase:WBGene00006787,unc-52,5,304,edge,ENSEMBL:ENSSSCG00000015556,LAMC2,5,769,0


# Edge2Vec Embedding Preparation

Build the graph from the edge list, storing the 'type' of each edge as an edge attribute. In addition, every edge gets a 'weight' attribute that is initialized to 1.0.

In [10]:
# Build the graph from the edge table and keep the 'type' column as edge attribute.
# The former `edge_key` argument has been removed: networkx only reads it when the
# target graph is a multigraph, so it never had an effect on this DiGraph.
G = nx.from_pandas_edgelist(edge2vec_df, 'index_head', 'index_tail', 'type', create_using=nx.DiGraph())
G = G.to_undirected()   # for the E2V implementation, use undirected graph

# Every edge starts with the same weight
for head, tail in G.edges():
    G[head][tail]['weight'] = 1.0

# Show one node and one edge as a sanity check of the stored attributes
for node in G.nodes(data=True):
    print('First node in graph:', node)
    break

for edge in G.edges(data=True):
    print('First edge in graph:', edge)
    break

print('Total number of edges is {}'.format(G.number_of_edges()))
print('Total number of nodes is {}'.format(G.number_of_nodes()))

First node in graph: (304, {})
First edge in graph: (304, 1542, {'type': 0, 'weight': 1.0})
Total number of edges is 55032
Total number of nodes is 10275


Count total number of edge types

In [11]:
# Number of distinct values in the 'type' column of the edge table
distinct_edge_types = {*edge2vec_df['type']}
type_size = len(distinct_edge_types)
print(f'There are {type_size} edge types')

There are 1 edge types


# Obtain Node Embeddings Using Edge2Vec

Define the parameters and generate the edge-type transition matrix using the Edge2Vec EM approach.

In [12]:
# Use parameter values obtained with hyperparameter optimization.
#   epoch       - number of update rounds of the transition matrix
#   num_walks   - number of walks per node
#   walk_length - length of each walk
#   p, q        - passed to the walk simulations
#   dim         - desired dimension of the node embedding
#   epochs      - number of epochs passed to Word2Vec
if dataset_nr == 1:
    epoch, num_walks, walk_length = 10, 2, 7
    p, q = 0.7, 1
    dim, epochs = 32, 10
else:
    epoch, num_walks, walk_length = 5, 6, 7
    p, q = 0.75, 1
    dim, epochs = 64, 10

# Settings that are shared by both datasets
# NOTE(review): `directed` is passed to simulate_walks_1 while G itself has been converted
# to an undirected graph - confirm that this combination is intended.
directed = True
e_step = 3

In [13]:
def get_e2v_embeddings(save=False):
    """Generate node embeddings for graph `G` with Edge2Vec.

    First the edge type transition matrix is updated for `epoch` rounds, then walks
    constrained by that matrix are generated and used to train a Word2Vec skip-gram model.

    Relies on the module-level settings `G`, `type_size`, `epoch`, `epochs`, `num_walks`,
    `walk_length`, `directed`, `p`, `q`, `e_step`, `dim` and `seed`, and, when saving,
    on `output_path` and `dataset_nr`.

    Args:
        save: When True, the transition matrix and the trained word vectors are written
            to `output_path`.

    Returns:
        A tuple `(M, e2v_embedding, e2v_embedding_list)`:
        - `M`: the resulting edge type transition matrix,
        - `e2v_embedding`: DataFrame with columns 'Node' and 'Embedding', indexed by and
          sorted on the node id,
        - `e2v_embedding_list`: the embeddings as list of lists, in the order of the
          Word2Vec vocabulary (`index_to_key`), which is NOT the sorted node order.
    """
    print('Initializing transition matrix...')
    M = transitions.initialize_edge_type_matrix(type_size)

    print('Updating transition matrix...')
    for i in range(epoch):
        walks = transitions.simulate_walks_1(G, num_walks, walk_length, M, directed, p, q, seed) # M step
        print(f'{i}th iteration of updating matrix')
        M = transitions.update_trans_matrix(walks, type_size, e_step) # E step

    print("Finished generating values for transition matrix!")

    print('Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):')
    print('\n'.join(['\t'.join([str(cell)[:5] for cell in row]) for row in M]))

    if save:
        # Save transition matrix
        np.save(f'{output_path}/transitionmatrix_{dataset_nr}.npy', M)

    print('Generate walks constrained by edge type transition matrix...')
    walks = edge2vec.simulate_walks_2(G, num_walks, walk_length, M, p, q, seed)

    # Generate node embeddings using Word2Vec (skip-gram model) with as input the generated walks
    window_size = walk_length - 1   # maximum distance between predicted and context node
    # With several worker threads the training order depends on OS thread scheduling, so a
    # seeded run is only reproducible when a single worker is used (see the gensim docs).
    workers = 1 if seed is not None else 8  # threads used

    w2v_model = edge2vec.Word2Vec(walks, vector_size=dim, window=window_size, min_count=0, sg=1, workers=workers, epochs=epochs, seed=seed)

    word_vectors = w2v_model.wv
    if save:
        word_vectors.save(f'{output_path}/w2v_{dataset_nr}.dvectors')

    # Build the embedding table in one go instead of adding row after row with .loc,
    # which gets slower with every row that is added.
    vocabulary = list(word_vectors.index_to_key)
    node_ids = [int(key) for key in vocabulary]
    e2v_embedding_list = [list(word_vectors[key]) for key in vocabulary]

    e2v_embedding = pd.DataFrame(
        {'Node': node_ids, 'Embedding': [list(word_vectors[key]) for key in vocabulary]},
        index=node_ids,
        columns=['Node', 'Embedding'],
    )
    e2v_embedding = e2v_embedding.sort_values('Node')

    return M, e2v_embedding, e2v_embedding_list

In [14]:
if concept_changes != RANDOM_FEATURES:
    # Regular case: train the Edge2Vec embeddings and store the results of this run
    get_e2v_embeddings(save=True)
else:
    # Baseline: give every node an embedding of `dim` uniformly drawn random values
    random.seed(111)
    nodes_total = G.number_of_nodes()

    random_emb_nodes = [
        [random.uniform(-0.5, 0.5) for _ in range(dim)]
        for _ in range(nodes_total)
    ]

    random_feats = pd.DataFrame(index=range(nodes_total), columns=['Node', 'Embedding'])
    random_feats['Node'] = random_feats.index
    random_feats['Embedding'] = random_emb_nodes

    random_feats_path = f'{output_path}/random_node_embeddings_{dataset_nr}.pkl'
    with open(random_feats_path, 'wb') as pickle_file:
        pickle.dump(random_feats, pickle_file)

Initializing transition matrix...
Updating transition matrix...
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
0th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
1th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
2th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
3th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
4th iteration of updating matrix
Finished generating values for transition matrix!
Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):
0.5
Generate walks constrained by edge type transition matrix...
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6


## Check embedding similarities

In [15]:
#transition_M_1, e2v_embedding_1_df, e2v_embedding_1_list = get_e2v_embeddings()
#transition_M_2, e2v_embedding_2_df, e2v_embedding_2_list = get_e2v_embeddings()

In [16]:
#from sklearn.metrics.pairwise import cosine_similarity

#similarity_matrix = cosine_similarity(e2v_embedding_1_list, e2v_embedding_2_list)

# NOTE(review): indexing with [node][node] assumes that row i of both embedding lists belongs
# to node i, but get_e2v_embeddings fills its list in w2v_model.wv.index_to_key order -
# confirm (or reorder the lists by node id) before re-enabling this check.
#cos_similarities = []
#for node in G.nodes():
#    cos_similarities.append(similarity_matrix[node][node])

In [17]:
#cos_similarities