In [1]:
import os
import time
import random
import pickle
import tqdm

In [2]:
import pandas as pd
import numpy as np
import pecanpy as pp
import networkx as nx

In [3]:
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph
from gensim.models import Word2Vec

#### Load graphs

In [4]:
# Load the pickled graph collection; `dataset` is a dict with graph ids as keys.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('./graphs.pkl', 'rb') as graphs_file:
    dataset = pickle.load(graphs_file)

In [5]:
# Graph ids in the dict's insertion order (iterating a dict yields its keys).
grn_ids = [*dataset]

In [6]:
# Display how many graph ids were loaded.
len(grn_ids)

248

In [7]:
# Directory layout, relative to the notebook's working directory.
GRN_FOLDER = "./links/"  # per-patient "<id>_grn.csv" edge tables are read from here
GRN_NODE2VEC_FOLDER = "./Node2Vec/"  # the pickled embedding dataset is written here
NODE2VEC_EDGE_LIST_FOLDER = GRN_NODE2VEC_FOLDER + "EdgeLists/"  # per-patient edge lists and .npy embeddings

In [8]:
# "ctype" labels that the PecanPy loop keeps; groups with any other label are skipped.
celltypes = [
    'Astro',
    'EN_L2_3_IT',
    'EN_L3_5_IT_1', 'EN_L3_5_IT_2', 'EN_L3_5_IT_3',
    'IN_ADARB2', 'IN_LAMP5', 'IN_SST', 'IN_VIP',
    'Micro_PVM',
    'OPC',
    'Oligo',
]

#### Node2Vec embeddings with PecanPy

In [9]:
# Random-search bookkeeping.
runs, folds = 20, 5  # number of runs, folds per run
run = fold = 0       # current run / fold
percentile = 80      # data filtering

# Candidate values for each hyperparameter.
# NOTE(review): 248 in 'dim' breaks the power-of-two pattern (256?) -- confirm it is intentional.
param_grid = dict(
    dim=[16, 32, 64, 128, 248, 512],
    num_walks=[20, 40, 60, 80, 100],
    walk_length=[10],
    dropout=[0, .4, .6, .8],
    gamma=[.96, .99, .995],
    lr=[1e-1, 1e-2, 1e-3, 1e-4],
)

# Draw one value per run for every hyperparameter. The draws happen in the
# grid's insertion order, so the seeded random stream is consumed in the
# sequence dim -> num_walks -> walk_length -> dropout -> gamma -> lr.
np.random.seed(42)
(dim_list, num_walks_list, walk_length_list,
 dropout_list, gamma_list, lr_list) = (
    np.random.choice(candidates, runs) for candidates in param_grid.values()
)

In [10]:
# Fixed settings read by the PecanPy embedding loop.
dim, num_walks, walk_length = 16, 20, 10  # passed to g.embed(dim=, num_walks=, walk_length=)
verbose = False  # when True, the loop prints whether embeddings were loaded or generated

In [12]:
# For every patient, write one directed edge list per retained cell type and
# embed it with PecanPy. Embeddings are cached as .npy files (keyed by cell
# type and `dim`), so finished work is loaded instead of recomputed.
# NOTE(review): the slice skips the first 164 ids -- presumably they were
# embedded in an earlier session; iterate over `grn_ids` to cover every patient.
for patient_id in tqdm.tqdm(grn_ids[164:]):
    patient_dir = os.path.join(NODE2VEC_EDGE_LIST_FOLDER, patient_id)
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(patient_dir, exist_ok=True)

    try:
        grn = pd.read_csv(os.path.join(GRN_FOLDER, patient_id + "_grn.csv"))

        for n, g_ctype in grn.groupby("ctype"):
            if n not in celltypes:
                continue

            # Going through a DiGraph collapses duplicate (source, target)
            # pairs before the edge list is written.
            G = nx.DiGraph()
            G.add_edges_from(zip(g_ctype["source"].to_list(), g_ctype["target"].to_list()))
            edg_fname = os.path.join(patient_dir, f"{n}_elist.edg")
            nx.write_edgelist(G, edg_fname)

            emb_fname = os.path.join(patient_dir, f"{n}_dim{dim}_emb.npy")
            if os.path.exists(emb_fname):
                if verbose:
                    print('Loading embeddings...')
                emb_pp = np.load(emb_fname)
            else:
                if verbose:
                    print('Generating embeddings...')
                # Build the model only when an embedding actually has to be computed.
                g = pp.pecanpy.SparseOTF(p=1, q=1, workers=4, verbose=False)
                g.read_edg(edg_fname, weighted=False, directed=True, delimiter=" ")
                emb_pp = g.embed(dim=dim, num_walks=num_walks, walk_length=walk_length)
                np.save(emb_fname, emb_pp)

    except Exception as e:
        # Best effort: one failing patient must not stop the batch, but the
        # message has to say which patient failed.
        print(f"{patient_id}: {e!r}")
                

100%|███████████████████████████████████████████| 84/84 [42:58<00:00, 30.69s/it]


#### Node2Vec embeddings with StellarGraph random walks and gensim Word2Vec

In [75]:
def node2vec(graph_df, length=20, n=10, vec_size=64, p=2, q=0.5, workers=20, seed=None):
    """Embed the nodes of an edge table with biased random walks + skip-gram Word2Vec.

    Parameters
    ----------
    graph_df : pandas.DataFrame
        Edge table handed to ``StellarGraph(edges=...)``; the caller in this
        notebook supplies "source" and "target" columns.
    length : int
        Maximum length of a random walk.
    n : int
        Number of random walks started from each root node.
    vec_size : int
        Dimensionality of the Word2Vec vectors.
    p : float
        Defines the (unnormalised) probability, 1/p, of returning to the source node.
    q : float
        Defines the (unnormalised) probability, 1/q, of moving away from the source node.
    workers : int
        Number of Word2Vec worker threads.
    seed : int or None
        Optional seed forwarded to the random walks and to Word2Vec. With
        ``None`` (the default) both libraries keep their own default seeding.
        NOTE(review): gensim documents that fully reproducible training also
        needs ``workers=1`` -- confirm before relying on bit-identical vectors.

    Returns
    -------
    node_embed : pandas.DataFrame
        One row per node in the Word2Vec vocabulary, indexed by node id, with
        ``vec_size`` columns.
    walks : list
        The random walks produced by ``BiasedRandomWalk.run``.
    """
    # NOTE(review): StellarGraph (rather than a directed variant) presumably
    # treats the edges as undirected, unlike the PecanPy cell, which reads its
    # edge lists with directed=True -- confirm this is intended.
    G = StellarGraph(edges=graph_df)
    walks = BiasedRandomWalk(G).run(
        nodes=list(G.nodes()),  # every node is used as a root
        length=length,
        n=n,
        p=p,
        q=q,
        seed=seed,
    )

    # Only override Word2Vec's own default seed when the caller asked for one.
    seed_kwargs = {} if seed is None else {"seed": seed}
    model = Word2Vec(
        walks,
        vector_size=vec_size,
        window=5,
        min_count=0,  # keep every node, however rarely it is visited
        sg=1,         # skip-gram
        workers=workers,
        epochs=5,
        **seed_kwargs,
    )

    # Rows of model.wv.vectors are aligned with model.wv.index_to_key.
    node_embed = pd.DataFrame(model.wv.vectors, index=model.wv.index_to_key)
    return node_embed, walks

In [76]:
# Result containers filled by the Word2Vec embedding loop.
GRN_node2vec_Dataset = {}        # patient id -> {cell type -> embedding DataFrame}
incorrect_patients_grn = dict()  # patient id -> exception raised while processing it

# Settings encoded in the output file name.
# Note: this rebinds walk_length, which the PecanPy settings cell set to 10.
walk_length, walk_per_node, vec_size = 20, 10, 64

In [1]:
# Embed every cell-type sub-network of every patient with node2vec() and
# collect the results as {patient id: {cell type: embedding DataFrame}}.
# A patient that raises is recorded in incorrect_patients_grn and gets no
# entry in GRN_node2vec_Dataset.
for patient_id in tqdm.tqdm(grn_ids):
    try:
        grn = pd.read_csv(GRN_FOLDER + patient_id + "_grn.csv")
        c_type_dict = {}

        for n, g_ctype in grn.groupby("ctype"):
            # Fresh two-column edge table for this cell type.
            graph_df = pd.DataFrame({
                "source": g_ctype["source"].to_list(),
                "target": g_ctype["target"].to_list(),
            })
            # Pass the configured settings explicitly so the embeddings always
            # match the values encoded in the output file name.
            node_embed, _walks = node2vec(
                graph_df,
                length=walk_length,
                n=walk_per_node,
                vec_size=vec_size,
            )
            c_type_dict[n] = node_embed

        GRN_node2vec_Dataset[patient_id] = c_type_dict

    except Exception as e:
        # Best effort: keep going, but remember which patient failed and why.
        incorrect_patients_grn[patient_id] = e

In [66]:
# File name encoding vector size, walk length and walks per node.
node2vec_grn_file = f'GRN_node2vec_{vec_size}dim_{walk_length}rw_{walk_per_node}walkpernode.pkl'
node2vec_grn_file

'GRN_node2vec_64dim_20rw_10walkpernode.pkl'

In [None]:
# Persist the collected embeddings. The output folder is created first so a
# missing directory cannot fail the write after the long embedding loop.
os.makedirs(GRN_NODE2VEC_FOLDER, exist_ok=True)
with open(os.path.join(GRN_NODE2VEC_FOLDER, node2vec_grn_file), 'wb') as f:
    pickle.dump(GRN_node2vec_Dataset, f)