In [1]:
import argparse
import gzip, os, csv
import numpy as np
import random
import time
import networkx as nx

In [22]:
dirname = 'db_kegg_pgk_graph'


def _read_id_map(handle):
    """Parse whitespace-separated ``name id`` lines into a ``{name: int(id)}`` dict.

    Lines that do not split into exactly two fields are skipped.
    """
    mapping = {}
    for line in handle:
        fields = line.strip().split()
        if len(fields) == 2:
            mapping[fields[0]] = int(fields[1])
    return mapping


# Context managers close the handles even if parsing raises; the originals
# were left open for the lifetime of the kernel.
with open(os.path.join(dirname, 'entity2id.txt'), 'r') as entity2id_file:
    entity2id = _read_id_map(entity2id_file)

with open(os.path.join(dirname, 'relation2id.txt'), 'r') as relation2id_file:
    relation2id = _read_id_map(relation2id_file)



In [23]:
class MySentences(object):
    """Re-iterable stream of walks read from disk one line at a time.

    Every entry of ``dirname`` whose name contains ``filename`` is treated as
    a directory; inside it, each file whose name contains ``'part'`` (and not
    ``'.crc'``) is read line by line. Each line is split on ``"->"`` and
    yielded as a list of tokens.
    """

    def __init__(self, dirname, filename):
        self.dirname = dirname
        self.filename = filename

    def __iter__(self):
        for subfname in os.listdir(self.dirname):
            if self.filename not in subfname:
                continue
            fpath = os.path.join(self.dirname, subfname)
            print ('Processing ',subfname)
            for fname in os.listdir(fpath):
                if 'part' not in fname:
                    continue
                if '.crc' in fname:
                    continue
                try:
                    # `with` closes the handle as soon as the part file is
                    # exhausted (or reading fails); the corpus is iterated
                    # several times, so leaked handles would pile up.
                    with open(os.path.join(fpath, fname), mode='r') as part_file:
                        for line in part_file:
                            yield line.rstrip('\n').split("->")
                except Exception:
                    # Best effort: report the unreadable part and move on.
                    print("Failed reading file:")
                    print(fname)


In [24]:
def extractFeatureVector(model, drugs, id2entity, output):
    """Write the embedding of every drug in ``drugs`` to a tab-separated file.

    Parameters
    ----------
    model : object exposing ``wv``, where ``key in model.wv`` and
        ``model.wv[key]`` look up the vector of a vocabulary token.
    drugs : iterable of raw ids (list, tuple or set). Each id is looked up in
        the model under the token ``"n" + str(id)``.
    id2entity : mapping from each id in ``drugs`` to the label written in the
        first column.
    output : path of the file to create.

    The first row is ``Drug<TAB>feature0<TAB>feature1...``. Ids whose token is
    missing from the model are printed and skipped.
    """
    ns = "n"
    # Sort once; this also lets `drugs` be a set, which cannot be indexed
    # with drugs[0] as the previous implementation did.
    ordered_ids = sorted(drugs)

    # Take the vector width from the first drug that is actually in the
    # vocabulary, so an out-of-vocabulary first element cannot abort the export.
    dim = 0
    for id_ in ordered_ids:
        nid = ns + str(id_)
        if nid in model.wv:
            dim = len(model.wv[nid])
            break
    header = "Drug" + "".join("\tfeature" + str(i) for i in range(dim))

    # The context manager closes the file even if a lookup below raises.
    with open(output, 'w') as fw:
        fw.write(header + "\n")
        for id_ in ordered_ids:
            nid = ns + str(id_)
            if nid not in model.wv:
                print (nid)
                continue
            vec = "\t".join(map(str, model.wv[nid]))
            fw.write(id2entity[id_] + '\t' + vec + '\n')
    

def extractFeatureVector_(model, entity2id, output, drugsfilename='../rdfvec/drubankids_ddi_v5.txt'):
    """Export vectors for the drugs listed in a drug-id file.

    Parameters
    ----------
    model : object exposing ``wv``, where ``key in model.wv`` and
        ``model.wv[key]`` look up the vector of a vocabulary token.
    entity2id : mapping from ``'<uri>'`` entity names to ids.
    output : path of the tab-separated file to create.
    drugsfilename : whitespace-separated file whose second column is the drug
        URI. Defaults to the path that was previously hard-coded.

    Each URI is wrapped in ``<...>`` and resolved through ``entity2id``; URIs
    not present there are ignored. The model token is ``"n" + str(id)``.
    The first output column is the URI with the
    ``<http://bio2rdf.org/drugbank:`` prefix and the closing ``>`` removed.
    Drugs whose token is missing from the model are printed and skipped.
    """
    ns = "n"
    drugs = set()

    with open(drugsfilename) as drugsfile:
        for l in drugsfile:
            fields = l.split()
            if len(fields) < 2:
                # Blank or one-column lines carry no URI to resolve.
                continue
            drug = '<' + fields[1] + '>'
            if drug not in entity2id:
                continue
            drugs.add((drug, ns + str(entity2id[drug])))

    ordered = sorted(drugs)

    # Take the vector width from the first drug found in the vocabulary rather
    # than from a hard-coded node id that may not exist in this model.
    dim = 0
    for _, nid in ordered:
        if nid in model.wv:
            dim = len(model.wv[nid])
            break
    header = "Drug" + "".join("\tfeature" + str(i) for i in range(dim))

    with open(output, 'w') as fw:
        fw.write(header + "\n")
        for drug, nid in ordered:
            dbid = drug.replace('<http://bio2rdf.org/drugbank:', '').replace('>', '')
            if nid not in model.wv:
                print (nid)
                continue
            vec = "\t".join(map(str, model.wv[nid]))
            fw.write(dbid + '\t' + vec + '\n')

In [25]:

import gensim


def _train_and_export(sentences, w2v_params, model_path, drugs, id2entity, vector_path):
    """Train one Word2Vec configuration, save it, and export the drug vectors.

    The model is local to this function, so it is released on return before
    the next configuration is trained.
    """
    model = gensim.models.Word2Vec(**w2v_params)
    model.build_vocab(sentences)
    # build_vocab() records the number of sentences it scanned, so no separate
    # counting pass over the corpus is needed.
    model.train(sentences, total_examples=model.corpus_count, epochs=5)
    model.save(model_path)
    extractFeatureVector(model, drugs, id2entity, vector_path)


def trainModel(drugs,id2entity, datafilename, model_output, vector_output, pattern, maxDepth):
    """Train skip-gram and CBOW embeddings on the walks matching ``pattern``.

    Parameters
    ----------
    drugs, id2entity : forwarded unchanged to ``extractFeatureVector``.
    datafilename : directory handed to ``MySentences`` as the walk root.
    model_output : directory for saved models; a ``pattern`` sub-directory is
        created inside it.
    vector_output : directory for the exported vector files.
    pattern : substring selecting walk directories; also used in output names.
    maxDepth : only used to build the ``_d<maxDepth>`` suffix of the model name.

    Two models are trained, saved and exported in turn: skip-gram with 15
    negative samples, then CBOW with ``cbow_mean=1`` and ``alpha=0.05``.
    """
    # makedirs creates missing parents (e.g. './models') and tolerates
    # directories that already exist; os.mkdir did neither.
    output = os.path.join(model_output, pattern)
    os.makedirs(output, exist_ok=True)
    os.makedirs(vector_output, exist_ok=True)

    sentences = MySentences(datafilename, filename=pattern) # a memory-friendly iterator

    # sg/cbow features iterations window negative hops random walks
    configurations = [
        ('Drug2Vec_sg_200_5_5_15_2_500',
         dict(size=200, workers=8, window=5, sg=1, negative=15, iter=5)),
        ('Drug2Vec_cbow_200_5_5_2_500',
         dict(size=200, workers=8, window=5, sg=0, iter=5, cbow_mean=1, alpha=0.05)),
    ]

    for prefix, w2v_params in configurations:
        modelname = prefix + '_d' + str(maxDepth)
        _train_and_export(
            sentences,
            w2v_params,
            os.path.join(output, modelname),
            drugs,
            id2entity,
            os.path.join(vector_output, modelname + '_' + pattern + '.txt'),
        )
    

In [26]:
drugsfilename = 'data/input/drubankids_ddi_v5.txt'
drugs = set()

# Collect the model token ('n' + entity id) of every listed drug that exists
# in entity2id. The second whitespace-separated column holds the drug URI.
with open(drugsfilename) as drugsfile:
    for l in drugsfile:
        l = l.split()
        if len(l) < 2:
            # Blank or one-column lines have no URI; l[1] would raise IndexError.
            continue
        drug = '<'+l[1]+'>'
        if drug not in entity2id: continue
        did  = 'n'+str(entity2id[drug])
        drugs.add(did)

In [None]:
dataset = 'DB_KEGG_PGK/'
datafilename = './walks/'+dataset
model_output = './models/'+dataset
pattern = 'uniform'
vector_output =  'vectors/'+dataset

# extractFeatureVector (called by trainModel) prefixes 'n' itself and indexes
# id2entity by raw id, so pass integer ids and an id -> entity map rather than
# the 'n'-prefixed tokens and the entity -> id map.
id2entity = {entity_id: entity for entity, entity_id in entity2id.items()}
drug_ids = sorted(int(str(did).lstrip('n')) for did in drugs)
trainModel(drug_ids, id2entity, datafilename, model_output, vector_output, pattern, maxDepth=4)

Processing  randwalks_n250_depth4_uniform.txt
Processing  randwalks_n250_depth1_uniform.txt
Processing  randwalks_n250_depth3_uniform.txt
Processing  randwalks_n250_depth2_uniform.txt
Processing  randwalks_n250_depth4_uniform.txt
Processing  randwalks_n250_depth1_uniform.txt
Processing  randwalks_n250_depth3_uniform.txt
Processing  randwalks_n250_depth2_uniform.txt
Processing  randwalks_n250_depth4_uniform.txt


In [8]:
dataset = 'DB_KEGG_PGK/'
datafilename = './walks/'+dataset
model_output = './models/'+dataset
pattern = 'pagerank_split'
vector_output =  'vectors/'+dataset

# extractFeatureVector (called by trainModel) prefixes 'n' itself and indexes
# id2entity by raw id, so pass integer ids and an id -> entity map rather than
# the 'n'-prefixed tokens and the entity -> id map.
id2entity = {entity_id: entity for entity, entity_id in entity2id.items()}
drug_ids = sorted(int(str(did).lstrip('n')) for did in drugs)
trainModel(drug_ids, id2entity, datafilename, model_output, vector_output, pattern, maxDepth=4)

Processing  randwalks_n250_depth3_pagerank_split.txt
Processing  randwalks_n250_depth1_pagerank_split.txt
Processing  randwalks_n250_depth4_pagerank_split.txt
Processing  randwalks_n250_depth2_pagerank_split.txt
Processing  randwalks_n250_depth3_pagerank_split.txt
Processing  randwalks_n250_depth1_pagerank_split.txt
Processing  randwalks_n250_depth4_pagerank_split.txt
Processing  randwalks_n250_depth2_pagerank_split.txt
Processing  randwalks_n250_depth3_pagerank_split.txt
Processing  randwalks_n250_depth1_pagerank_split.txt
Processing  randwalks_n250_depth4_pagerank_split.txt
Processing  randwalks_n250_depth2_pagerank_split.txt
Processing  randwalks_n250_depth3_pagerank_split.txt
Processing  randwalks_n250_depth1_pagerank_split.txt
Processing  randwalks_n250_depth4_pagerank_split.txt
Processing  randwalks_n250_depth2_pagerank_split.txt
Processing  randwalks_n250_depth3_pagerank_split.txt
Processing  randwalks_n250_depth1_pagerank_split.txt
Processing  randwalks_n250_depth4_pagerank_spl

In [9]:
dataset = 'DB_KEGG_PGK/'
datafilename = './walks/'+dataset
model_output = './models/'+dataset
pattern = 'pagerank_pushdown'
vector_output =  'vectors/'+dataset

# extractFeatureVector (called by trainModel) prefixes 'n' itself and indexes
# id2entity by raw id, so pass integer ids and an id -> entity map rather than
# the 'n'-prefixed tokens and the entity -> id map.
id2entity = {entity_id: entity for entity, entity_id in entity2id.items()}
drug_ids = sorted(int(str(did).lstrip('n')) for did in drugs)
trainModel(drug_ids, id2entity, datafilename, model_output, vector_output, pattern, maxDepth=4)

Processing  randwalks_n250_depth2_pagerank_pushdown.txt
Processing  randwalks_n250_depth3_pagerank_pushdown.txt
Processing  randwalks_n250_depth4_pagerank_pushdown.txt
Processing  randwalks_n250_depth1_pagerank_pushdown.txt
Processing  randwalks_n250_depth2_pagerank_pushdown.txt
Processing  randwalks_n250_depth3_pagerank_pushdown.txt
Processing  randwalks_n250_depth4_pagerank_pushdown.txt
Processing  randwalks_n250_depth1_pagerank_pushdown.txt
Processing  randwalks_n250_depth2_pagerank_pushdown.txt
Processing  randwalks_n250_depth3_pagerank_pushdown.txt
Processing  randwalks_n250_depth4_pagerank_pushdown.txt
Processing  randwalks_n250_depth1_pagerank_pushdown.txt
Processing  randwalks_n250_depth2_pagerank_pushdown.txt
Processing  randwalks_n250_depth3_pagerank_pushdown.txt
Processing  randwalks_n250_depth4_pagerank_pushdown.txt
Processing  randwalks_n250_depth1_pagerank_pushdown.txt
Processing  randwalks_n250_depth2_pagerank_pushdown.txt
Processing  randwalks_n250_depth3_pagerank_pushd