# Import Libraries

In [1]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import random
import pickle

import edge2vec.transition3 as transitions
import edge2vec.edge2vec3 as edge2vec

import data_params as input_data_params

# Set Parameters

Specify which dataset is used.
*   1 (Original knowledge graph)
*   2 (Restructured knowledge graph)

In [2]:
dataset_prefix = input_data_params.dataset

disease_prefix = input_data_params.disease
assert disease_prefix == 'dmd' or 'hd' or 'oi'

embedding_method = 'e2v'

use_seed = input_data_params.e2v_seed

if use_seed:
    fixed_emb = '_seeded'
else:
    fixed_emb = ''

print(dataset_prefix, disease_prefix)

prev hd


# Set Folder

In [3]:
curr_working_dir = os.path.dirname(os.getcwd())
curr_output_dir = os.path.join(curr_working_dir, 'output', disease_prefix)
dataset_output_dir = os.path.join(curr_output_dir, f'{dataset_prefix}_{embedding_method}{fixed_emb}')

if not os.path.exists(dataset_output_dir):
    os.mkdir(dataset_output_dir)
    print(f'Output folder for dataset {dataset_prefix} {disease_prefix} is created: {dataset_output_dir}')
else:
    print(f'Output folder for dataset {dataset_prefix} {disease_prefix} already exists and will be used: {dataset_output_dir}')

Output folder for dataset prev hd already exists and will be used: C:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v


In [4]:
def create_run_folder(run_nr: int):
    foldername = 'run_{:03d}'.format(run_nr)
    run_path = os.path.join(dataset_output_dir, foldername)
    os.mkdir(run_path)

    print(f'Output folder for current run on dataset {dataset_prefix} {disease_prefix} is created: {run_path}')
    return run_path

run_folders_list = []

for item in os.listdir(dataset_output_dir):
    curr_path = os.path.join(dataset_output_dir, item)
    if os.path.isdir(curr_path) and 'run' in item:
        run_folders_list.append(item)

curr_run_nr = 1
if len(run_folders_list) > 0:
    run_folders_list.sort(reverse=True)
    latest_run = run_folders_list[0]
    run_nr_str = latest_run.split('_')[1]

    next_run_nr = int(run_nr_str) + 1
    curr_run_nr = next_run_nr
    output_path = create_run_folder(next_run_nr)
else:
    output_path = create_run_folder(1)

Output folder for current run on dataset prev hd is created: C:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_010


In [5]:
def set_seed_val(run_nr: int):
    if use_seed:
        seed_val = 10   # Fixed seed for each run
        seed_txt_file = os.path.join(output_path, 'seed_value.txt')
        with open(seed_txt_file, 'w') as f:
            f.write(f'Seed value for this run is: {seed_val}')
        print(f'File {seed_txt_file} has been saved storing the seed value used during the embedding step of current run.')

        return seed_val
    else:
        return None

seed = set_seed_val(curr_run_nr)
print(f'Current value of e2v embedding seed is {seed}')

Current value of e2v embedding seed is None


# Load Nodes and Edges

In [6]:
nodes = pd.read_csv(f'../output/{disease_prefix}/{dataset_prefix}_{disease_prefix}_indexed_nodes.csv')
nodes

Unnamed: 0,index_id,id,semantic,label,semantic_id
0,0,ENSEMBL:ENSRNOG00000061099,ORTH,A0A0G2JVI3,5
1,1,ENSEMBL:ENSSSCG00000038558,ORTH,ENSEMBL:ENSSSCG00000038558,5
2,2,MP:0002682,DISO,decreased mature ovarian follicle number,1
3,3,ENSEMBL:ENSECAG00000014460,ORTH,ENSEMBL:ENSECAG00000014460,5
4,4,HP:0000736,DISO,Short attention span,1
...,...,...,...,...,...
14828,14828,ENSEMBL:ENSPTRG00000041525,ORTH,ENSEMBL:ENSPTRG00000041525,5
14829,14829,RGD:1308778,ORTH,Upf3a,5
14830,14830,ENSEMBL:ENSECAG00000023674,ORTH,ENSEMBL:ENSECAG00000023674,5
14831,14831,dictyBase:DDB_G0292406,GENO,rab4,4


In [7]:
edges = pd.read_csv(f'../output/{disease_prefix}/{dataset_prefix}_{disease_prefix}_indexed_edges.csv')
edges

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,FlyBase:FBgn0029629,eIF3g1,5,8560,interacts with,FlyBase:FBgn0034237,eIF3b,5,5659,0
1,ENSEMBL:ENSGALG00000041380,ENSEMBL:ENSGALG00000041380,5,8256,in orthology relationship with,Xenbase:XB-GENE-484711,mhc1b2,5,11308,1
2,ENSEMBL:ENSXETG00000009076,kit,5,4336,in orthology relationship with,MGI:95956,H2-T22,5,5744,1
3,ZFIN:ZDB-GENE-040426-2723,fez1,5,11226,in 1 to 1 orthology relationship with,ENSEMBL:ENSACAG00000000745,ENSEMBL:ENSACAG00000000745,5,6528,2
4,ENSEMBL:ENSCAFG00000015827,PFN1,5,4290,in orthology relationship with,ZFIN:ZDB-GENE-031002-33,pfn1,5,2133,1
...,...,...,...,...,...,...,...,...,...,...
228994,ENSEMBL:ENSGALG00000038504,ENSEMBL:ENSGALG00000038504,5,12490,in 1 to 1 orthology relationship with,ENSEMBL:ENSMODG00000017399,ENSEMBL:ENSMODG00000017399,5,9507,2
228995,ENSEMBL:ENSFCAG00000014170,ENSEMBL:ENSFCAG00000014170,5,4355,in 1 to 1 orthology relationship with,ENSEMBL:ENSBTAG00000012927,ALDOA,5,238,2
228996,ENSEMBL:ENSGALG00000033212,HSP90AA1,5,9770,is part of,GO:0005634,nucleus,6,4795,5
228997,ZFIN:ZDB-GENE-051202-1,grin1a,5,8741,in orthology relationship with,EnsemblGenome:AT2G32390,EnsemblGenome:AT2G32390,5,3388,1


In [8]:
nodes.loc[nodes['index_id'] == 1041]

Unnamed: 0,index_id,id,semantic,label,semantic_id
1041,1041,1990,DRUG,omeprazole,2


In [9]:
edge2vec_df = edges.copy()
edge2vec_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,FlyBase:FBgn0029629,eIF3g1,5,8560,interacts with,FlyBase:FBgn0034237,eIF3b,5,5659,0
1,ENSEMBL:ENSGALG00000041380,ENSEMBL:ENSGALG00000041380,5,8256,in orthology relationship with,Xenbase:XB-GENE-484711,mhc1b2,5,11308,1
2,ENSEMBL:ENSXETG00000009076,kit,5,4336,in orthology relationship with,MGI:95956,H2-T22,5,5744,1
3,ZFIN:ZDB-GENE-040426-2723,fez1,5,11226,in 1 to 1 orthology relationship with,ENSEMBL:ENSACAG00000000745,ENSEMBL:ENSACAG00000000745,5,6528,2
4,ENSEMBL:ENSCAFG00000015827,PFN1,5,4290,in orthology relationship with,ZFIN:ZDB-GENE-031002-33,pfn1,5,2133,1
5,ZFIN:ZDB-GENE-081104-199,rnf216,5,11222,in orthology relationship with,ENSEMBL:ENSSSCG00000005914,SHARPIN,5,539,1
6,ZFIN:ZDB-GENE-090313-280,si:dkey-253i9.4,5,13340,in orthology relationship with,dictyBase:DDB_G0289665,gefQ,4,19,1
7,ENSEMBL:ENSMODG00000011702,ENSEMBL:ENSMODG00000011702,5,1872,in 1 to 1 orthology relationship with,ZFIN:ZDB-GENE-040426-1851,wdr61,5,3311,2
8,ZFIN:ZDB-GENE-021030-4,cad,5,7440,has phenotype,ZP:0003997,"cerebellum decreased size, abnormal",1,1378,3
9,FlyBase:FBgn0283472,S6k,5,7950,has phenotype,FBcv:0000717,increased cell growth,1,1233,3


# Edge2Vec Embedding Preparation

Initialize edge list with keys 'type' and 'weight'. The 'weight' of each edge is initialized to 1.0. 

In [10]:
G = nx.from_pandas_edgelist(edge2vec_df, 'index_head', 'index_tail', 'type', create_using=nx.DiGraph(), edge_key= (('type', int),('id', int)))
G = G.to_undirected()   # for the E2V implementation, use indirected graph
for edge in G.edges():
    G[edge[0]][edge[1]]['weight'] = 1.0
    
for node in G.nodes(data=True):
    print('First node in graph:', node)
    break

for edge in G.edges(data=True):
    print('First edge in graph:', edge)
    break

print('Total number of edges is {}'.format(G.number_of_edges()))
print('Total number of nodes is {}'.format(G.number_of_nodes()))

First node in graph: (8560, {})
First edge in graph: (8560, 5659, {'type': 0, 'weight': 1.0})
Total number of edges is 134749
Total number of nodes is 14833


Count total number of edge types

In [11]:
print(set(edge2vec_df['type']))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, -1}


In [12]:
edge2vec_df.loc[edge2vec_df['type'] == -1].head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
166,MGI:2675620,Htt<tm1Mfc>/Htt<tm1Mfc> [involves: 129S1/Sv * ...,5,12824,,MGI:96067,Htt,5,4859,-1
489,MGI:5575509,Tg(Prnp-SNAP25/HTT*150Q)8Xjl [involves: FVB],5,12074,,HGNC:4851,HTT,3,12425,-1
744,MGI:4438239,Tg(HTT*)BXwy [FVB-Tg(HTT*)BXwy],5,869,,HGNC:4851,HTT,3,12425,-1
885,MGI:1861935,Htt<tm5Mem>,5,4868,,MGI:3698041,Htt<tm5Mem>/Htt<+> [involves: 129S1/Sv * 129X1...,5,33,-1
2180,MGI:3528066,Tg(HDexon1)62Gpb; Tgm2<tm1.1Rmgr>/Tgm2<tm1.1Rm...,5,3669,,HGNC:4851,HTT,3,12425,-1
4864,MGI:6400710,Htt<tm5Detl>/Htt<+> [involves: C57BL/6],5,13227,,MGI:96067,Htt,5,4859,-1
7499,MGI:4438240,Htt<tm1Szi>/Htt<tm1Szi>; Tg(HTT*)BXwy [involve...,5,5133,,HGNC:4851,HTT,3,12425,-1
7755,MGI:2177756,Htt<tm1Detl>,5,14344,,MGI:3573928,Htt<tm1Detl>/Htt<+> [involves: 129P2/OlaHsd * ...,5,1017,-1
8179,MGI:3586839,Htt<tm1Hay>/Htt<+> [involves: C57BL/6J],5,3074,,MGI:96067,Htt,5,4859,-1
10803,MGI:2429756,Tg(YAC72)2511Hay,5,2119,,MGI:5298846,Htt<tm1Hay>/Htt<+>; Tg(YAC72)2511Hay [involves...,5,10115,-1


In [13]:
type_size = len(set(edge2vec_df['type']))
print(f'There are {type_size} edge types')

There are 22 edge types


# Obtain Node Embeddings Using Edge2Vec

Define parameters and generate edge type transition matrix using Edge2Vec EM approach

In [14]:
e2v_params = {
    'dmd': {
        'prev': {
            'epochs': 10,
            'num_walks': 2,
            'walk_length': 7,
            'p': 0.70,
            'q': 1.0,
            'dim': 32
        },
        'restr': {
            'epochs': 5,
            'num_walks': 6,
            'walk_length': 7,
            'p': 0.75,
            'q': 1.0,
            'dim': 64
        }
    },
    'hd': {
        'prev': {
            'epochs': 10,
            'num_walks': 6,
            'walk_length': 7,
            'p': 0.5,
            'q': 0.75,
            'dim': 64
        },
        'restr': {
            'epochs': 10,
            'num_walks': 2,
            'walk_length': 7,
            'p': 1.0,
            'q': 1.0,
            'dim': 128
        }
    },
    'oi': {
        'prev': {
            'epochs': 10,
            'num_walks': 6,
            'walk_length': 7,
            'p': 1.0,
            'q': 0.5,
            'dim': 128
        },
        'restr': {
            'epochs': 10,
            'num_walks': 4,
            'walk_length': 7,
            'p': 0.5,
            'q': 0.5,
            'dim': 32
        }
    }
}
    
directed = True
e_step = 3

params_to_use = e2v_params[disease_prefix][dataset_prefix]

epochs = params_to_use['epochs']
num_walks = params_to_use['num_walks']
walk_length = params_to_use['walk_length']
p = params_to_use['p']
q = params_to_use['q']
dim = params_to_use['dim']

print(epochs, num_walks, walk_length, dim, p, q)

10 6 7 64 0.5 0.75


In [15]:
def get_e2v_embeddings(save=False):
    print('Initializing transition matrix...')
    M = transitions.initialize_edge_type_matrix(type_size)

    print('Updating transition matrix...')
    for i in range(epochs):
        walks = transitions.simulate_walks_1(G, num_walks, walk_length, M, directed, p, q, seed) # M step
        print(f'{i}th iteration of updating matrix')
        M = transitions.update_trans_matrix(walks, type_size, e_step) # E step

    print("Finished generating values for transition matrix!")
    
    print('Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):')
    print('\n'.join(['\t'.join([str(cell)[:5] for cell in row]) for row in M]))
    
    if save:
        # Save transition matrix
        np.save(f'{output_path}/transitionmatrix_{dataset_prefix}_{disease_prefix}.npy', M)
    
    print('Generate walks constrained by edge type transition matrix...')
    walks = edge2vec.simulate_walks_2(G, num_walks, walk_length, M, p, q, seed)
    
    # Generate node embeddings using Word2Vec (skip-gram model) with as input the generated walks 
    window_size = walk_length - 1   # maximum distance between predicted and context node
    workers = 8 # threads used

    w2v_model = edge2vec.Word2Vec(walks, vector_size=dim, window=window_size, min_count=0, sg=1, workers=workers, epochs=epochs, seed=seed)
    
    word_vectors = w2v_model.wv
    if save:
        word_vectors.save(f'{output_path}/w2v_{dataset_prefix}_{disease_prefix}.dvectors')
        
    e2v_embedding = pd.DataFrame(columns = ['Node', 'Embedding'])
    e2v_embedding_list = []
    for _, key in enumerate(w2v_model.wv.index_to_key):
        e2v_embedding.loc[int(key)] = pd.Series({'Node':int(key), 'Embedding':list(w2v_model.wv[key])})
        e2v_embedding_list.append(list(w2v_model.wv[key]))
        
    e2v_embedding = e2v_embedding.sort_values('Node')
    
    return M, e2v_embedding, e2v_embedding_list

In [16]:
get_e2v_embeddings(save=True)

Initializing transition matrix...
Updating transition matrix...
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
0th iteration of updating matrix




Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
1th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
2th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
3th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
4th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
5th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
6th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
7th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
8th iteration of updating matrix
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
9th iteration of updating matrix
Finished generating values for transition matrix!
Overview of resulting edge type transition matrix (float values have been truncated to fit the view!):
0.5	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	0.5	1.0	1.0	0.5	0.

([[0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   0.5,
   0.5,
   0.5],
  [1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   0.5,
   0.5,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   0.5,
   1.0,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.9933071490757153,
   1.0,
   1.0,
   0.5,
   0.9996646498695336,
   0.9996646498695336,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   0.5,
   0.5,
   0.5,
   0.5],
  [1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0