# Import Libraries

In [1]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import random
import pickle

import edge2vec.transition3 as transitions
import edge2vec.edge2vec3 as edge2vec

import data_params as input_data_params

# Set Parameters

Specify which dataset is used.
*   1 (Original knowledge graph)
*   2 (Restructured knowledge graph)

In [2]:
dataset_prefix = input_data_params.dataset

disease_prefix = input_data_params.disease
assert disease_prefix == 'dmd' or 'hd' or 'oi'

embedding_method = 'e2v'

use_seed = input_data_params.e2v_seed

if use_seed:
    fixed_emb = '_seeded'
else:
    fixed_emb = ''

print(dataset_prefix, disease_prefix)

restr hd


# Set Folder

In [3]:
curr_working_dir = os.path.dirname(os.getcwd())
curr_output_dir = os.path.join(curr_working_dir, 'output', disease_prefix)
dataset_output_dir = os.path.join(curr_output_dir, f'{dataset_prefix}_{embedding_method}{fixed_emb}')

if not os.path.exists(dataset_output_dir):
    os.mkdir(dataset_output_dir)
    print(f'Output folder for dataset {dataset_prefix} {disease_prefix} is created: {dataset_output_dir}')
else:
    print(f'Output folder for dataset {dataset_prefix} {disease_prefix} already exists and will be used: {dataset_output_dir}')

Output folder for dataset restr hd already exists and will be used: C:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\restr_e2v


In [4]:
def create_run_folder(run_nr: int):
    foldername = 'run_{:03d}'.format(run_nr)
    run_path = os.path.join(dataset_output_dir, foldername)
    os.mkdir(run_path)

    print(f'Output folder for current run on dataset {dataset_prefix} {disease_prefix} is created: {run_path}')
    return run_path

run_folders_list = []

for item in os.listdir(dataset_output_dir):
    curr_path = os.path.join(dataset_output_dir, item)
    if os.path.isdir(curr_path) and 'run' in item:
        run_folders_list.append(item)

curr_run_nr = 1
if len(run_folders_list) > 0:
    run_folders_list.sort(reverse=True)
    latest_run = run_folders_list[0]
    run_nr_str = latest_run.split('_')[1]

    next_run_nr = int(run_nr_str) + 1
    curr_run_nr = next_run_nr
    output_path = create_run_folder(next_run_nr)
else:
    output_path = create_run_folder(1)

Output folder for current run on dataset restr hd is created: C:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\restr_e2v\run_010


In [5]:
def set_seed_val(run_nr: int):
    if use_seed:
        seed_val = 10   # Fixed seed for each run
        seed_txt_file = os.path.join(output_path, 'seed_value.txt')
        with open(seed_txt_file, 'w') as f:
            f.write(f'Seed value for this run is: {seed_val}')
        print(f'File {seed_txt_file} has been saved storing the seed value used during the embedding step of current run.')

        return seed_val
    else:
        return None

seed = set_seed_val(curr_run_nr)
print(f'Current value of e2v embedding seed is {seed}')

Current value of e2v embedding seed is None


# Load Nodes and Edges

In [6]:
nodes = pd.read_csv(f'../output/{disease_prefix}/{dataset_prefix}_{disease_prefix}_indexed_nodes.csv')
nodes

Unnamed: 0,index_id,id,semantic,label,semantic_id
0,0,ENSEMBL:ENSSSCG00000033987,gene,UBE2E3,4
1,1,MP:0000790,phenotype,abnormal stratification in cerebral cortex,8
2,2,ZP:0011626,phenotype,"cell trunk apoptotic, abnormal",8
3,3,ZP:0019272,phenotype,pronephric nephron tubule epithelial cell diff...,8
4,4,ENSEMBL:ENSACAG00000007327,gene,ENSEMBL:ENSACAG00000007327,4
...,...,...,...,...,...
14877,14877,MP:0008584,phenotype,photoreceptor outer segment degeneration,8
14878,14878,ENSEMBL:ENSFCAG00000011716,gene,ENSEMBL:ENSFCAG00000011716,4
14879,14879,MP:0009412,phenotype,skeletal muscle fiber degeneration,8
14880,14880,ZP:0002448,phenotype,"retinal outer nuclear layer disorganized, abno...",8


In [7]:
edges = pd.read_csv(f'../output/{disease_prefix}/{dataset_prefix}_{disease_prefix}_indexed_edges.csv')
edges

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,HGNC:11280,SQSTM1,4,9467,interacts with,HGNC:9539,PSMB2,4,12355,0
1,FlyBase:FBgn0264855,AP-2alpha,4,6589,in orthology relationship with,MGI:101921,Ap2a1,4,10645,1
2,SGD:S000001709,VPS1,4,9926,in orthology relationship with,WormBase:WBGene00001134,eat-3,4,8881,1
3,RGD:1595923,RT1-A2,4,9824,in orthology relationship with,ENSEMBL:ENSDARG00000115781,CR339041.3,4,14166,1
4,WormBase:WBGene00012735,sptf-3,4,1165,in orthology relationship with,FlyBase:CG5669,FlyBase:CG5669,4,3446,1
...,...,...,...,...,...,...,...,...,...,...
230223,ENSEMBL:ENSGALG00000003800,PABPC4,4,942,in orthology relationship with,ENSEMBL:ENSECAG00000009396,ENSEMBL:ENSECAG00000009396,4,7423,1
230224,ENSEMBL:ENSECAG00000006095,ENSEMBL:ENSECAG00000006095,4,541,in orthology relationship with,ENSEMBL:ENSGALG00000050515,A0A3Q2UAA5,4,13107,1
230225,ZFIN:ZDB-GENE-060825-293,rab9a,4,4090,in orthology relationship with,ENSEMBL:ENSCAFG00000011750,ENSEMBL:ENSCAFG00000011750,4,3844,1
230226,ENSEMBL:ENSGALG00000008534,POLR2H,4,11884,in orthology relationship with,ZFIN:ZDB-GENE-050417-66,polr2h,4,3372,1


In [8]:
nodes.loc[nodes['index_id'] == 1041]

Unnamed: 0,index_id,id,semantic,label,semantic_id
1041,1041,ZFIN:ZDB-GENE-140820-11,gene,mhc1lja,4


In [9]:
edge2vec_df = edges.copy()
edge2vec_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,HGNC:11280,SQSTM1,4,9467,interacts with,HGNC:9539,PSMB2,4,12355,0
1,FlyBase:FBgn0264855,AP-2alpha,4,6589,in orthology relationship with,MGI:101921,Ap2a1,4,10645,1
2,SGD:S000001709,VPS1,4,9926,in orthology relationship with,WormBase:WBGene00001134,eat-3,4,8881,1
3,RGD:1595923,RT1-A2,4,9824,in orthology relationship with,ENSEMBL:ENSDARG00000115781,CR339041.3,4,14166,1
4,WormBase:WBGene00012735,sptf-3,4,1165,in orthology relationship with,FlyBase:CG5669,FlyBase:CG5669,4,3446,1
5,Xenbase:XB-GENE-486558,pias1,4,8551,interacts with,Xenbase:XB-GENE-967617,cetn1,4,6209,0
6,ENSEMBL:ENSCAFG00000017516,CALM1,4,10763,in orthology relationship with,FlyBase:FBgn0034774,CG13526,4,1670,1
7,ENSEMBL:ENSECAG00000022119,ENSEMBL:ENSECAG00000022119,4,8715,in orthology relationship with,ENSEMBL:ENSMODG00000015091,ENSEMBL:ENSMODG00000015091,4,3536,1
8,ENSEMBL:ENSRNOG00000049403,ENSEMBL:ENSRNOG00000049403,4,2590,in orthology relationship with,ENSEMBL:ENSCAFG00000029076,CCDC126,4,10793,1
9,Xenbase:XB-GENE-952286,cenpe,4,8623,in orthology relationship with,HGNC:1856,CENPE,4,3187,1


# Edge2Vec Embedding Preparation

Initialize edge list with keys 'type' and 'weight'. The 'weight' of each edge is initialized to 1.0. 

In [10]:
G = nx.from_pandas_edgelist(edge2vec_df, 'index_head', 'index_tail', 'type', create_using=nx.DiGraph(), edge_key= (('type', int),('id', int)))
G = G.to_undirected()   # for the E2V implementation, use indirected graph
for edge in G.edges():
    G[edge[0]][edge[1]]['weight'] = 1.0
    
for node in G.nodes(data=True):
    print('First node in graph:', node)
    break

for edge in G.edges(data=True):
    print('First edge in graph:', edge)
    break

print('Total number of edges is {}'.format(G.number_of_edges()))
print('Total number of nodes is {}'.format(G.number_of_nodes()))

First node in graph: (9467, {})
First edge in graph: (9467, 12355, {'type': 0, 'weight': 1.0})
Total number of edges is 135988
Total number of nodes is 14882


Count total number of edge types

In [11]:
print(set(edge2vec_df['type']))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}


In [12]:
edge2vec_df.loc[edge2vec_df['type'] == -1].head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type


In [13]:
type_size = len(set(edge2vec_df['type']))
print(f'There are {type_size} edge types')

There are 19 edge types


# Obtain Node Embeddings Using Edge2Vec

Define parameters and generate edge type transition matrix using Edge2Vec EM approach

In [14]:
e2v_params = {
    'dmd': {
        'prev': {
            'epochs': 10,
            'num_walks': 2,
            'walk_length': 7,
            'p': 0.70,
            'q': 1.0,
            'dim': 32
        },
        'restr': {
            'epochs': 5,
            'num_walks': 6,
            'walk_length': 7,
            'p': 0.75,
            'q': 1.0,
            'dim': 64
        }
    },
    'hd': {
        'prev': {
            'epochs': 10,
            'num_walks': 6,
            'walk_length': 7,
            'p': 0.5,
            'q': 0.75,
            'dim': 64
        },
        'restr': {
            'epochs': 10,
            'num_walks': 2,
            'walk_length': 7,
            'p': 1.0,
            'q': 1.0,
            'dim': 128
        }
    },
    'oi': {
        'prev': {
            'epochs': 10,
            'num_walks': 6,
            'walk_length': 7,
            'p': 1.0,
            'q': 0.5,
            'dim': 128
        },
        'restr': {
            'epochs': 10,
            'num_walks': 4,
            'walk_length': 7,
            'p': 0.5,
            'q': 0.5,
            'dim': 32
        }
    }
}
    
directed = True
e_step = 3

params_to_use = e2v_params[disease_prefix][dataset_prefix]

epochs = params_to_use['epochs']
num_walks = params_to_use['num_walks']
walk_length = params_to_use['walk_length']
p = params_to_use['p']
q = params_to_use['q']
dim = params_to_use['dim']

print(epochs, num_walks, walk_length, dim, p, q)

10 2 7 128 1.0 1.0


In [15]:
def get_e2v_embeddings(save=False):
    print('Initializing transition matrix...')
    M = transitions.initialize_edge_type_matrix(type_size)

    print('Updating transition matrix...')
    for i in range(epochs):
        walks = transitions.simulate_walks_1(G, num_walks, walk_length, M, directed, p, q, seed) # M step
        print(f'{i}th iteration of updating matrix')
        M = transitions.update_trans_matrix(walks, type_size, e_step) # E step

    print("Finished generating values for transition matrix!")
    
    print('Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):')
    print('\n'.join(['\t'.join([str(cell)[:5] for cell in row]) for row in M]))
    
    if save:
        # Save transition matrix
        np.save(f'{output_path}/transitionmatrix_{dataset_prefix}_{disease_prefix}.npy', M)
    
    print('Generate walks constrained by edge type transition matrix...')
    walks = edge2vec.simulate_walks_2(G, num_walks, walk_length, M, p, q, seed)
    
    # Generate node embeddings using Word2Vec (skip-gram model) with as input the generated walks 
    window_size = walk_length - 1   # maximum distance between predicted and context node
    workers = 8 # threads used

    w2v_model = edge2vec.Word2Vec(walks, vector_size=dim, window=window_size, min_count=0, sg=1, workers=workers, epochs=epochs, seed=seed)
    
    word_vectors = w2v_model.wv
    if save:
        word_vectors.save(f'{output_path}/w2v_{dataset_prefix}_{disease_prefix}.dvectors')
        
    e2v_embedding = pd.DataFrame(columns = ['Node', 'Embedding'])
    e2v_embedding_list = []
    for _, key in enumerate(w2v_model.wv.index_to_key):
        e2v_embedding.loc[int(key)] = pd.Series({'Node':int(key), 'Embedding':list(w2v_model.wv[key])})
        e2v_embedding_list.append(list(w2v_model.wv[key]))
        
    e2v_embedding = e2v_embedding.sort_values('Node')
    
    return M, e2v_embedding, e2v_embedding_list

In [16]:
get_e2v_embeddings(save=True)

Initializing transition matrix...
Updating transition matrix...
Walk iteration:
1 / 2
2 / 2
0th iteration of updating matrix




Walk iteration:
1 / 2
2 / 2
1th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
2th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
3th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
4th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
5th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
6th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
7th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
8th iteration of updating matrix
Walk iteration:
1 / 2
2 / 2
9th iteration of updating matrix
Finished generating values for transition matrix!
Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):
0.5	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	0.5	1.0	1.0	1.0	0.5	0.5	0.5
1.0	0.5	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	0.5	1.0	1.0	0.5	0.5	0.5	0.5
1.0	1.0	0.5	1.0	1.0	1.0	1.0	1.0	0.999	1.0	1.0	0.999	0.999	0.999	0.999	0.989	0.5	0.5	0.5
1.0	1.0	1.0	0.5	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.

([[0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   0.5,
   0.5,
   0.5],
  [1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   0.5,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.9999999999999993,
   1.0,
   1.0,
   0.9999999999993086,
   0.9996646498695336,
   0.9999999997210531,
   0.9999999999622486,
   0.9890130573694068,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   0.9999251537724895,
  