In [65]:
import sys
import csv, re, copy
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *
import numpy as np
import time
from collections import Counter
from gensim.models import KeyedVectors
from pycorenlp import StanfordCoreNLP
from nltk.tree import Tree
# Client for a Stanford CoreNLP server. The URL points at localhost port 9000,
# so a server has to be listening there before any cell calls nlp.annotate.
nlp = StanfordCoreNLP('http://localhost:9000')

In [35]:
def loadSentences(filename):
    """Read a tab-separated sentence file into a list of row mappings.

    Parameters
    ----------
    filename : str
        Path to a UTF-8 encoded, tab-delimited file whose first line is the
        header row.

    Returns
    -------
    list
        One mapping per data row, keyed by the header names, in file order.
        A file that contains only the header yields an empty list.
    """
    # The context manager closes the handle even when parsing raises.
    # DictReader is lazy, so the rows must be materialised before the
    # block exits.
    with open(filename, encoding="utf8") as f:
        return list(csv.DictReader(f, delimiter='\t'))

In [84]:
def loadAllTextSentences():
    """Load all sentence files, tokenize each distinct sentence, and collect the vocabulary.

    Rows are read from the three dataset files with ``loadSentences``. Each
    row is keyed by its ``Sentence-ID`` value with the first four characters
    removed; only the first row seen for a given id is kept.

    Returns
    -------
    tuple
        ``(TextSentenceID, vocabulary)``. ``TextSentenceID`` maps the id to a
        dict with the keys ``'id'``, ``'text'``, ``'pmid'`` and ``'tokens'``;
        ``vocabulary`` is the set of every token of every kept sentence.
    """
    sentences = loadSentences('dataset/Training.sentence')
    sentences.extend(loadSentences('dataset/SampleSet.sentence'))
    sentences.extend(loadSentences('dataset/Task1NeuV3_corrected.sentence'))

    TextSentenceID = dict()
    vocabulary = set()
    for line in sentences:
        sentence_id = line['Sentence-ID'][4:]
        # Skip repeated ids *before* tokenizing: annotate() is a request to
        # the CoreNLP server and its result would be thrown away anyway.
        if sentence_id in TextSentenceID:
            continue
        text = line['Sentence']
        output = nlp.annotate(text, properties={'annotators': 'tokenize', 'outputFormat': 'json'})
        tokens = [tok['word'] for tok in output['tokens']]
        TextSentenceID[sentence_id] = {'id': sentence_id,
                                       'text': text,
                                       'pmid': line['PMID'],
                                       'tokens': tokens}
        # In-place update; avoids rebuilding the whole set for every sentence.
        vocabulary.update(tokens)
    print('Download text sentences:', len(TextSentenceID), 'sentences')
    return TextSentenceID, vocabulary

In [85]:
# Build the sentence table and the token vocabulary, then spot-check one
# known sentence id and the vocabulary size.
TextSentenceID, vocabulary = loadAllTextSentences()
print(TextSentenceID['10000072'])
print(len(vocabulary))

Download text sentences: 6536 sentences
{'id': '10000072', 'text': 'it was found that a 6-fold increase in Fdft1 activity compared with that of the wild-type did not cause significant changes in HmgCoA reductase activity, while the amounts of synthesized dolichols and ergosterols increased by 80 and 32 percent respectively.', 'pmid': '10623644', 'tokens': ['it', 'was', 'found', 'that', 'a', '6-fold', 'increase', 'in', 'Fdft1', 'activity', 'compared', 'with', 'that', 'of', 'the', 'wild-type', 'did', 'not', 'cause', 'significant', 'changes', 'in', 'HmgCoA', 'reductase', 'activity', ',', 'while', 'the', 'amounts', 'of', 'synthesized', 'dolichols', 'and', 'ergosterols', 'increased', 'by', '80', 'and', '32', 'percent', 'respectively', '.']}
19036


In [86]:
# Load the binary word2vec-format vectors from disk and print one vector as a
# sanity check. NOTE(review): the recorded outputs for 'Increases' and
# 'increases' differ, so lookups appear to be case-sensitive.
word_vectors = KeyedVectors.load_word2vec_format('word_embeddings/PubMed-and-PMC-w2v.bin', binary=True)
print(word_vectors['Increases'])

[-1.25327557e-01 -2.05734119e-01  2.20678654e-02  1.27095148e-01
  4.70568202e-02  3.66582334e-01  1.80289820e-01 -6.96827620e-02
  5.25160849e-01  2.50934307e-02  1.86377347e-01 -1.57668844e-01
  5.11006951e-01  2.82196283e-01 -1.45905316e-01 -1.02183104e-01
 -1.58878171e-03 -2.69769728e-01  4.36125807e-02 -3.74512225e-02
  1.44765481e-01 -1.72953263e-01  5.64784929e-02  2.03118950e-01
 -2.29118302e-01 -3.89206707e-01  1.89598396e-01  8.48720893e-02
 -2.92850465e-01 -1.89046666e-01  3.03188503e-01 -4.85944226e-02
  2.32507274e-01  1.78006619e-01  9.79960859e-02  6.02323450e-02
  1.65033221e-01 -3.79372507e-01  1.18517898e-01 -1.47823170e-01
  1.21478774e-01 -2.50081658e-01  2.41490863e-02  1.28086820e-01
  3.87153685e-01 -1.73163749e-02 -1.84716210e-01 -2.07878187e-01
  9.35073644e-02  3.20446283e-01  6.42037690e-02 -4.05614406e-01
  7.49878660e-02 -1.26757715e-02  1.44438535e-01 -3.08646530e-01
  1.06738694e-02  2.82481462e-01 -2.62360632e-01 -2.80956089e-01
 -1.86210290e-01 -1.15877

In [101]:
# Build the index <-> word tables and the matching initial embedding matrix.
emb_dim = 200
T_vocab_size = 50000

# Index 0 is '_' and index 1 is 'unk'; the dataset vocabulary follows, and
# pre-trained words are appended after it until the size cap is reached.
T_idx2w = ['_', 'unk'] + list(vocabulary)
known_words = set(T_idx2w)  # O(1) membership test instead of scanning the list
for word in word_vectors.vocab.keys():
    if word not in known_words:
        known_words.add(word)
        T_idx2w.append(word)
    if len(T_idx2w) >= T_vocab_size + 2:
        break
T_idx2w.extend(['start_id', 'end_id'])
T_w2idx = {word: idx for idx, word in enumerate(T_idx2w)}
T_vocab_size_total = len(T_idx2w)

# Collect one row per index and convert once at the end; growing an array
# with np.append copies the whole matrix on every iteration.
rows = list(np.random.uniform(-0.1, 0.1, (2, emb_dim)))  # rows for '_' and 'unk'
count = 2  # number of randomly initialised rows so far
for i in range(2, T_vocab_size_total - 2):
    if T_idx2w[i] in word_vectors.vocab:
        rows.append(word_vectors[T_idx2w[i]])
    else:
        rows.append(np.random.uniform(-0.1, 0.1, emb_dim))
        count += 1
# 'start_id' and 'end_id' each need their own row, otherwise the last index
# of T_w2idx has no corresponding row in the matrix.
rows.extend(np.random.uniform(-0.1, 0.1, (2, emb_dim)))
word_embedding = np.asarray(rows, dtype=np.float64)
assert word_embedding.shape == (T_vocab_size_total, emb_dim)
print(word_embedding[T_w2idx['increases']])
print(count+2)

[-1.77897915e-01 -6.07956201e-02  1.82623565e-01  9.40433890e-03
 -4.57968228e-02  1.06524616e-01  3.05105671e-02 -1.31171560e-02
  2.61207402e-01 -7.96180367e-02  2.10063905e-01 -2.63688445e-01
  3.40338528e-01  4.86342199e-02 -1.47659093e-01 -4.99520265e-02
 -6.67950213e-02 -3.32690239e-01 -5.26371263e-02  1.95578754e-01
  3.12451199e-02 -4.54581976e-02 -1.43305317e-01  6.22850917e-02
 -1.26622334e-01 -1.32162586e-01  1.25042289e-01 -6.89747334e-02
  6.96995528e-03 -2.80402958e-01  8.84695575e-02 -2.48210698e-01
  4.07526689e-03 -1.68188468e-01  4.15355772e-01  8.11652094e-02
  1.10174470e-01  5.04403608e-03  1.02366670e-03 -1.77017331e-01
  4.66788188e-02 -1.58089146e-01  1.15800232e-01 -9.48347151e-02
  1.79338381e-01  1.55628501e-02 -7.92423785e-02 -2.33671039e-01
  1.54874176e-01  2.62574106e-02 -2.71606948e-02 -2.14085847e-01
 -3.42812538e-02  2.45975465e-01 -1.43955141e-01 -1.01725556e-01
 -9.12490115e-03  1.26160726e-01 -1.70875147e-01 -1.63989186e-01
 -1.38413506e-02  5.44368

In [93]:
# Print the raw pre-trained vector for 'increases' and the vocab entry stored
# under '_'. NOTE(review): presumably used to compare against the row printed
# from word_embedding in the vocabulary cell - confirm.
print(word_vectors['increases'])
print(word_vectors.vocab['_'])
# for k in word_vectors.vocab.keys():
#     print(k)

[-1.77897915e-01 -6.07956201e-02  1.82623565e-01  9.40433890e-03
 -4.57968228e-02  1.06524616e-01  3.05105671e-02 -1.31171560e-02
  2.61207402e-01 -7.96180367e-02  2.10063905e-01 -2.63688445e-01
  3.40338528e-01  4.86342199e-02 -1.47659093e-01 -4.99520265e-02
 -6.67950213e-02 -3.32690239e-01 -5.26371263e-02  1.95578754e-01
  3.12451199e-02 -4.54581976e-02 -1.43305317e-01  6.22850917e-02
 -1.26622334e-01 -1.32162586e-01  1.25042289e-01 -6.89747334e-02
  6.96995528e-03 -2.80402958e-01  8.84695575e-02 -2.48210698e-01
  4.07526689e-03 -1.68188468e-01  4.15355772e-01  8.11652094e-02
  1.10174470e-01  5.04403608e-03  1.02366670e-03 -1.77017331e-01
  4.66788188e-02 -1.58089146e-01  1.15800232e-01 -9.48347151e-02
  1.79338381e-01  1.55628501e-02 -7.92423785e-02 -2.33671039e-01
  1.54874176e-01  2.62574106e-02 -2.71606948e-02 -2.14085847e-01
 -3.42812538e-02  2.45975465e-01 -1.43955141e-01 -1.01725556e-01
 -9.12490115e-03  1.26160726e-01 -1.70875147e-01 -1.63989186e-01
 -1.38413506e-02  5.44368

In [91]:
# Print the current value of emb_dim and the pre-trained vector for 'unk'.
# NOTE(review): the recorded output shows 4087446 for emb_dim, which does not
# match the emb_dim = 200 assignment elsewhere in this notebook; presumably
# the name held a different value when this cell last ran - rerun in order.
print(emb_dim)
print(word_vectors['unk'])

4087446
[ 5.75057864e-01  7.52782375e-02 -3.01809937e-01  1.01989359e-02
 -3.28983128e-01 -3.13937850e-02  6.51189238e-02 -1.40033633e-01
  5.46455204e-01  1.05655245e-01 -5.59008531e-02 -3.08947384e-01
  1.48965105e-01 -2.30761945e-01  1.62432060e-01  3.47166389e-01
  1.78274699e-02  2.16196924e-01 -2.33157679e-01  1.75116267e-02
  1.90005854e-01 -2.58263722e-02  3.54427963e-01 -9.64064226e-02
  6.08074255e-02  6.34836440e-04  2.44468406e-01  4.54809636e-01
 -1.16582677e-01 -6.61844313e-02  1.06634788e-01 -1.33135706e-01
  6.68184906e-02  4.18486178e-01  1.87044203e-01  4.02310371e-01
 -4.54130545e-02  3.69635940e-01 -2.86800236e-01 -1.83732420e-01
  5.36106646e-01 -6.98395818e-02 -1.47025902e-02  1.05892718e-01
 -7.59536102e-02 -2.90077984e-01  4.18996289e-02  5.22052586e-01
  3.80992472e-01 -2.62682699e-02  2.84932703e-02  2.50182487e-02
 -2.45723184e-02 -1.82328582e-01  1.48312569e-01 -4.10612971e-01
 -1.80383157e-02 -1.23839051e-01 -6.49206415e-02  1.05875485e-01
 -5.05968809e-01 

In [None]:
            net_encode = EmbeddingInputlayer(
                inputs = encode_seqs,
                vocabulary_size = E_vocabSizeTotal,
                embedding_size = emb_dim,
                name = 'encode_seq_embedding')
            net_decode = EmbeddingInputlayer(
                inputs = decode_seqs,
                vocabulary_size = V_vocabSizeTotal,
                embedding_size = emb_dim,
                name = 'decode_seq_embedding')
            vs.reuse_variables()
            tl.layers.set_name_reuse(True)
        net_rnn = Seq2Seq(net_encode, net_decode,
                cell_fn = tf.contrib.rnn.BasicLSTMCell,
                n_hidden = emb_dim,
                initializer = tf.random_uniform_initializer(-0.1, 0.1),
                encode_sequence_length = retrieve_seq_length_op2(encode_seqs),
                decode_sequence_length = retrieve_seq_length_op2(decode_seqs),
                initial_state_encode = None,
                dropout = (0.5 if is_train else None),
                n_layer = 3,
                return_seq_2d = True,
                name = 'seq2seq')
        net_out = DenseLayer(net_rnn, n_units=V_vocabSizeTotal, act=tf.identity, name='output')
    return net_out, net_rnn

In [2]:
## Step 1: Build the embedding matrix and load the existing embedding matrix.
vocabulary_size = 5000
embedding_size = 200

print("Load existing embedding matrix and dictionaries")

x = tf.placeholder(tf.int32)

emb_net = tl.layers.EmbeddingInputlayer(
                inputs = x,
                vocabulary_size = vocabulary_size,
                embedding_size = embedding_size,
                name ='embedding_layer')

Load existing embedding matrix and dictionaries
  [TL] EmbeddingInputlayer embedding_layer: (5000, 200)


TypeError: Fetch argument <tensorlayer.layers.EmbeddingInputlayer object at 0x000002100AC0A9B0> has invalid type <class 'tensorlayer.layers.EmbeddingInputlayer'>, must be a string or Tensor. (Can not convert a EmbeddingInputlayer into a Tensor or Operation.)

In [6]:
# Open an interactive session and initialise the variables of the graph.
sess = tf.InteractiveSession()
tl.layers.initialize_global_variables(sess)

# Show the parameters and the layer outputs registered on the network.
emb_net.print_params()
emb_net.print_layers()

# Look up two ids in the freshly initialised table and show the result.
lookup_feed = {x: [1000, 122]}
vector = sess.run(emb_net.outputs, feed_dict=lookup_feed)
print('vector:', vector.shape)
print(vector)

  param   0: embedding_layer/embeddings:0 (5000, 200)        float32_ref (mean: 8.860853995429352e-05, median: -7.748603820800781e-06, std: 0.05772264301776886)   
  num of params: 1000000
  layer   0: embedding_layer/embedding_lookup:0 <unknown>          float32
vector: (2, 200)
[[-0.0814091  -0.02825403 -0.05328007 -0.02780311 -0.0653923  -0.02430246
   0.01921391  0.09056108 -0.09929579 -0.04090867 -0.01154137  0.09985436
  -0.01916699 -0.00505853  0.04639273 -0.0912411   0.05427978 -0.09722042
  -0.00647257  0.08200861  0.02909157 -0.04913936 -0.02959277 -0.0851137
   0.00671704  0.09491635 -0.05910125  0.091788   -0.06434856  0.06943984
  -0.04222007  0.07136836 -0.01448791  0.08609817  0.00808146  0.03219343
   0.04411275  0.09080558 -0.03130624 -0.00718544  0.08492801  0.07755976
   0.00010915 -0.04611282 -0.00340497  0.07246631  0.03042834 -0.06478818
  -0.01676147 -0.09139509 -0.01971438  0.09120715 -0.07756324 -0.06961818
   0.03664833  0.02337506  0.09040568  0.00635266 -0.0

In [9]:
# Synthetic stand-in for a pre-trained table: every entry of row i equals i,
# so the row returned for a looked-up id is recognisable at a glance.
# The dimensions come from the same names that sized the embedding layer, and
# the dtype is float32 to match the layer's embedding variable.
embed_pretrained = np.tile(
    np.arange(vocabulary_size, dtype=np.float32)[:, np.newaxis],
    (1, embedding_size))
tl.files.assign_params(sess, [embed_pretrained], emb_net)

[<tf.Tensor 'Assign:0' shape=(5000, 200) dtype=float32_ref>]

In [10]:
# Repeat the lookup for ids 1000 and 122 and print the rows the layer returns
# after the parameter assignment.
vector = sess.run(emb_net.outputs, feed_dict={x : [1000,122]})
print('vector:', vector.shape)
print(vector)

vector: (2, 200)
[[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.
  1000. 1000. 1000. 1