# Import Libraries

In [None]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import random
import pickle

import edge2vec.transition3 as transitions
import edge2vec.edge2vec3 as edge2vec

import data_params as input_data_params

# Set Parameters

Specify which dataset is used (the value is read from `data_params.py` and must match a key of `e2v_params` below).
*   `prev` (Original knowledge graph)
*   `restr` (Restructured knowledge graph)

In [3]:
# Dataset and disease selection are read from the shared `data_params` module.
dataset_prefix = input_data_params.dataset

disease_prefix = input_data_params.disease
# Use a membership test: `x == 'dmd' or 'hd' or 'oi'` is always truthy because
# the non-empty string literals are evaluated as conditions on their own.
assert disease_prefix in ('dmd', 'hd', 'oi'), \
    f"Unsupported disease prefix {disease_prefix!r}; expected 'dmd', 'hd' or 'oi'"

embedding_method = 'e2v'

use_seed = input_data_params.e2v_seed

# Suffix that keeps seeded runs in a separate output folder from unseeded runs.
fixed_emb = '_seeded' if use_seed else ''

# Set Folder

In [4]:
# Output layout: <cwd>/output/<disease>/<dataset>_<method>[_seeded]
curr_working_dir = os.getcwd()
curr_output_dir = os.path.join(curr_working_dir, 'output', disease_prefix)
dataset_output_dir = os.path.join(curr_output_dir, f'{dataset_prefix}_{embedding_method}{fixed_emb}')

if not os.path.exists(dataset_output_dir):
    # makedirs also creates missing parent folders (e.g. output/<disease>),
    # where os.mkdir would raise FileNotFoundError.
    os.makedirs(dataset_output_dir, exist_ok=True)
    print(f'Output folder for dataset {dataset_prefix} {disease_prefix} is created: {dataset_output_dir}')
else:
    print(f'Output folder for dataset {dataset_prefix} {disease_prefix} already exists and will be used: {dataset_output_dir}')

Output folder for dataset 2 already exists and will be used: C:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\hd\g2_e2v


In [5]:
def create_run_folder(run_nr: int):
    """Create the folder for one run inside `dataset_output_dir`.

    The folder is named `run_<NNN>`, with the run number zero-padded to three
    digits. `os.mkdir` is used, so the call raises if the folder already exists.

    Args:
        run_nr: number of the run to create a folder for.

    Returns:
        Path of the newly created run folder.
    """
    run_path = os.path.join(dataset_output_dir, f'run_{run_nr:03d}')
    os.mkdir(run_path)

    print(f'Output folder for current run on dataset {dataset_prefix} {disease_prefix} is created: {run_path}')
    return run_path

# Collect the existing run folders. Only exact 'run_<digits>' directory names
# are accepted, so unrelated folders that merely contain 'run' are ignored
# instead of breaking the number parsing.
run_prefix = 'run_'
run_folders_list = []

for item in os.listdir(dataset_output_dir):
    curr_path = os.path.join(dataset_output_dir, item)
    if os.path.isdir(curr_path) and item.startswith(run_prefix) and item[len(run_prefix):].isdecimal():
        run_folders_list.append(item)

# Latest run first. Compare run numbers as integers: a plain string sort would
# rank 'run_999' above 'run_1000'.
run_folders_list.sort(key=lambda name: int(name[len(run_prefix):]), reverse=True)

# Continue after the highest existing run number, or start at 1.
curr_run_nr = int(run_folders_list[0][len(run_prefix):]) + 1 if run_folders_list else 1
output_path = create_run_folder(curr_run_nr)

Output folder for current run on dataset 2 is created: C:\Users\rosa-\Google Drive\Msc_Bioinformatics\thesis\XAIFO-ThesisProject\output\hd\g2_e2v\run_010


In [None]:
def set_seed_val(run_nr: int):
    """Return the embedding seed for the current run and record it on disk.

    When `use_seed` is falsy, nothing is written and None is returned.
    Otherwise the fixed seed value 10 is written to `seed_value.txt` inside
    `output_path` and returned.

    Args:
        run_nr: number of the current run (not used by the function body).

    Returns:
        The seed value (10), or None when seeding is disabled.
    """
    if not use_seed:
        return None

    seed_val = 10   # Fixed seed for each run
    seed_txt_file = os.path.join(output_path, 'seed_value.txt')
    with open(seed_txt_file, 'w') as seed_file:
        seed_file.write(f'Seed value for this run is: {seed_val}')
    print(f'File {seed_txt_file} has been saved storing the seed value used during the embedding step of current run.')

    return seed_val

# Resolve the seed for this run (None when seeding is disabled) and report it.
seed = set_seed_val(run_nr=curr_run_nr)
print(f'Current value of e2v embedding seed is {seed}')

# Load Nodes and Edges

In [6]:
# Indexed node table for the selected dataset and disease.
nodes = pd.read_csv(
    f'output/{disease_prefix}/{dataset_prefix}_{disease_prefix}_indexed_nodes.csv'
)
# Bare expression so the notebook renders the frame.
nodes

Unnamed: 0,index_id,id,semantic,label,semantic_id
0,0,dictyBase:DDB_G0282941,gene,DDB0185273,4
1,1,WBPhenotype:0001133,phenotype,division axis defective,8
2,2,HP:0012005,phenotype,Deja vu aura,8
3,3,ENSEMBL:ENSPTRG00000043959,gene,ENSEMBL:ENSPTRG00000043959,4
4,4,HP:0001265,phenotype,Hyporeflexia,8
...,...,...,...,...,...
14877,14877,MGI:2676637,gene,H2-M11,4
14878,14878,MP:0000378,phenotype,absent hair follicles,8
14879,14879,WormBase:WBGene00006446,gene,atx-3,4
14880,14880,MGI:2654144,gene,Ercc6l,4


In [7]:
# Indexed edge table for the selected dataset and disease.
edges = pd.read_csv(
    f'output/{disease_prefix}/{dataset_prefix}_{disease_prefix}_indexed_edges.csv'
)
# Bare expression so the notebook renders the frame.
edges

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0000774,gametogenesis variant,8,9968,0
1,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0002188,male somatic gonad development variant,8,3292,0
2,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0001224,axon outgrowth variant,8,8349,0
3,WormBase:WBGene00006537,tbb-2,4,10272,in orthology relationship with,MGI:107812,Tubb5,4,4998,1
4,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0001100,early embryonic lethal,8,6541,0
...,...,...,...,...,...,...,...,...,...,...
230223,Q99460,26S proteasome non-ATPase regulatory subunit 1,5,4895,is product of,HGNC:9554,PSMD1,4,352,11
230224,P53396,ATP-citrate synthase,5,1449,is product of,HGNC:115,ACLY,4,10260,11
230225,Q60631,Growth factor receptor-bound protein 2,5,12956,is product of,MGI:95805,Grb2,4,6661,11
230226,1980,octreotide,3,5596,targets,P35372,Mu-type opioid receptor,5,6566,12


In [8]:
# Spot check: show the node row whose index_id equals 1840.
nodes.loc[nodes['index_id'].eq(1840)]

Unnamed: 0,index_id,id,semantic,label,semantic_id
1840,1840,ENSEMBL:ENSCAFG00000018930,gene,GIT1,4


In [9]:
# Work on a separate copy so the loaded `edges` frame stays untouched.
edge2vec_df = edges.copy(deep=True)
edge2vec_df.head(n=10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0000774,gametogenesis variant,8,9968,0
1,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0002188,male somatic gonad development variant,8,3292,0
2,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0001224,axon outgrowth variant,8,8349,0
3,WormBase:WBGene00006537,tbb-2,4,10272,in orthology relationship with,MGI:107812,Tubb5,4,4998,1
4,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0001100,early embryonic lethal,8,6541,0
5,WormBase:WBGene00006537,tbb-2,4,10272,in orthology relationship with,ENSEMBL:ENSSSCG00000001379,ENSEMBL:ENSSSCG00000001379,4,13971,1
6,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0001102,mitotic spindle defective early emb,8,6832,0
7,WormBase:WBGene00006537,tbb-2,4,10272,interacts with,WormBase:WBGene00006533,tba-7,4,3902,2
8,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,WBPhenotype:0001037,sterile progeny,8,11086,0
9,WormBase:WBGene00006537,tbb-2,4,10272,causes condition,ZP:0017152,"germ cell development process quality, abnormal",8,5981,0


# Edge2Vec Embedding Preparation

Build the graph from the edge list. Every edge carries two attributes: 'type' (taken from the edge table) and 'weight' (initialized to 1.0 for each edge).

In [10]:
# Build a directed graph from the indexed edge table; the 'type' column is
# stored as an edge attribute. No `edge_key` is passed: that argument expects a
# column name and only applies when `create_using` is a multigraph.
G = nx.from_pandas_edgelist(edge2vec_df, 'index_head', 'index_tail', 'type', create_using=nx.DiGraph())
G = G.to_undirected()   # for the E2V implementation, use undirected graph

# Every edge starts with the same weight.
nx.set_edge_attributes(G, 1.0, 'weight')

# Show one node and one edge as a sanity check of the graph contents.
for node in G.nodes(data=True):
    print('First node in graph:', node)
    break

for edge in G.edges(data=True):
    print('First edge in graph:', edge)
    break

print('Total number of edges is {}'.format(G.number_of_edges()))
print('Total number of nodes is {}'.format(G.number_of_nodes()))

First node in graph: (10272, {})
First edge in graph: (10272, 9968, {'type': 0, 'weight': 1.0})
Total number of edges is 135988
Total number of nodes is 14882


Count total number of edge types

In [11]:
# Number of distinct values in the 'type' column; later passed to the
# transition matrix initialisation and update.
distinct_edge_types = set(edge2vec_df['type'])
type_size = len(distinct_edge_types)
print(f'There are {type_size} edge types')

There are 19 edge types


# Obtain Node Embeddings Using Edge2Vec

Define parameters and generate edge type transition matrix using Edge2Vec EM approach

In [12]:
# Edge2Vec hyperparameters, keyed first by disease and then by dataset variant.
e2v_params = {
    'dmd': {
        'prev': dict(epochs=10, num_walks=2, walk_length=7, p=0.70, q=1.0, dim=32),
        'restr': dict(epochs=5, num_walks=6, walk_length=7, p=0.75, q=1.0, dim=64),
    },
    'hd': {
        'prev': dict(epochs=10, num_walks=6, walk_length=5, p=0.75, q=0.5, dim=32),
        'restr': dict(epochs=10, num_walks=6, walk_length=5, p=0.5, q=1.0, dim=128),
    },
    'oi': {
        'prev': dict(epochs=10, num_walks=6, walk_length=7, p=1.0, q=0.5, dim=128),
        'restr': dict(epochs=10, num_walks=4, walk_length=7, p=0.5, q=0.5, dim=32),
    },
}

directed = True   # forwarded to transitions.simulate_walks_1
e_step = 3        # forwarded to transitions.update_trans_matrix

# Pick the configuration that matches the selected disease and dataset.
params_to_use = e2v_params[disease_prefix][dataset_prefix]

epochs, num_walks, walk_length, p, q, dim = (
    params_to_use[name]
    for name in ('epochs', 'num_walks', 'walk_length', 'p', 'q', 'dim')
)

print(epochs, num_walks, walk_length, dim, p, q)

6 5 128 0.5 1 10


In [13]:
def get_e2v_embeddings(save=False):
    """Train Edge2Vec node embeddings on the graph `G`.

    Steps:
      1. Initialise the edge type transition matrix and refine it for `epochs`
         rounds of walk simulation (M step) followed by a matrix update (E step).
      2. Simulate walks constrained by the resulting matrix.
      3. Fit a skip-gram Word2Vec model on those walks.

    Reads the notebook-level settings `G`, `type_size`, `epochs`, `num_walks`,
    `walk_length`, `directed`, `p`, `q`, `e_step`, `dim`, `seed`, `output_path`,
    `dataset_prefix` and `disease_prefix`.

    Args:
        save: when True, write the transition matrix (.npy) and the word
            vectors (.dvectors) into `output_path`.

    Returns:
        Tuple `(M, e2v_embedding, e2v_embedding_list)`:
        the transition matrix, a DataFrame with columns 'Node' and 'Embedding'
        indexed and sorted by node id, and the embeddings as a list of lists in
        the order of the Word2Vec vocabulary.
    """
    print('Initializing transition matrix...')
    M = transitions.initialize_edge_type_matrix(type_size)

    print('Updating transition matrix...')
    for i in range(epochs):
        walks = transitions.simulate_walks_1(G, num_walks, walk_length, M, directed, p, q, seed) # M step
        print(f'{i}th iteration of updating matrix')
        M = transitions.update_trans_matrix(walks, type_size, e_step) # E step

    print("Finished generating values for transition matrix!")

    print('Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):')
    print('\n'.join('\t'.join(str(cell)[:5] for cell in row) for row in M))

    if save:
        # Save transition matrix
        np.save(f'{output_path}/transitionmatrix_{dataset_prefix}_{disease_prefix}.npy', M)

    print('Generate walks constrained by edge type transition matrix...')
    walks = edge2vec.simulate_walks_2(G, num_walks, walk_length, M, p, q, seed)

    # Generate node embeddings using Word2Vec (skip-gram model) with as input the generated walks
    window_size = walk_length - 1   # maximum distance between predicted and context node
    # A seeded Word2Vec run is only reproducible with a single worker thread
    # (thread scheduling otherwise changes the training order); unseeded runs
    # keep 8 threads for speed.
    workers = 1 if seed is not None else 8

    w2v_model = edge2vec.Word2Vec(walks, vector_size=dim, window=window_size, min_count=0, sg=1, workers=workers, epochs=epochs, seed=seed)

    word_vectors = w2v_model.wv
    if save:
        word_vectors.save(f'{output_path}/w2v_{dataset_prefix}_{disease_prefix}.dvectors')

    # Build the table in one go instead of growing a DataFrame row by row.
    vocabulary = list(word_vectors.index_to_key)
    node_ids = [int(key) for key in vocabulary]
    e2v_embedding_list = [list(word_vectors[key]) for key in vocabulary]

    e2v_embedding = pd.DataFrame(
        {
            'Node': node_ids,
            # Separate list objects, so the frame and the returned list do not share state.
            'Embedding': [list(vector) for vector in e2v_embedding_list],
        },
        index=node_ids,
    ).sort_values('Node')

    return M, e2v_embedding, e2v_embedding_list

In [14]:
# Run the embedding pipeline; save=True also writes the transition matrix and
# the word vectors into output_path. The returned tuple is not assigned here.
get_e2v_embeddings(save=True)

Initializing transition matrix...
Updating transition matrix...
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
0th iteration of updating matrix




Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
1th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
2th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
3th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
4th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
5th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
6th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
7th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
8th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
9th iteration of updating matrix
Finished generating values for transition matrix!
Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):
0.5	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	0.5	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	0.5
1.0	0.