In [1]:
import tensorflow as tf
import numpy as np

# Closed vocabulary; a token's position in this list is its integer id.
# '<PAD>' must stay at index 0 so padded positions are encoded as 0.
VOCABS = ['<PAD>', 'apple', 'banana', 'orange', 'kiwi', '<EOS>', '<UNK>']
VOCABS_SIZE = len(VOCABS)
# token -> id. enumerate() yields exactly one id per token; the previous
# range(3 + len(VOCABS)) had a dead "+3" that zip() silently truncated away.
VOCABS_LOOKUP = {token: index for index, token in enumerate(VOCABS)}
# Every encoded sentence is truncated / left-padded to this many ids.
MAX_SENTENCE_LEN = 20

  from ._conv import register_converters as _register_converters


In [2]:
def sentence_to_padded_index(sentence, max_sentence_len=MAX_SENTENCE_LEN):
    """Encode a whitespace-tokenised sentence as a fixed-length list of vocab ids.

    Tokens are looked up in ``VOCABS_LOOKUP``; tokens not found there map to
    the id of ``'<UNK>'``. Only the first ``max_sentence_len`` tokens are kept,
    and shorter sentences are left-padded with the id of ``'<PAD>'`` so the
    real tokens sit at the end of the sequence.

    Args:
        sentence: String whose tokens are separated by whitespace.
        max_sentence_len: Length of the returned list.

    Returns:
        A list of ``max_sentence_len`` integer ids. An empty or
        whitespace-only sentence yields a list made entirely of padding ids.
    """
    unknown_index = VOCABS_LOOKUP.get('<UNK>')
    padding_index = VOCABS_LOOKUP.get('<PAD>')
    sentence_ids = [VOCABS_LOOKUP.get(token, unknown_index) for token in sentence.split()]
    sentence_ids = sentence_ids[:max_sentence_len]
    # Build the padding explicitly instead of slice-assigning into a
    # pre-filled list: with zero tokens, `seq[-0:] = []` is `seq[0:] = []`
    # and wiped the whole sequence, returning [] rather than all padding.
    padding = [padding_index] * (max_sentence_len - len(sentence_ids))
    return padding + sentence_ids

# Three probes for the encoder: a sentence mixing known and unknown tokens,
# a single-token sentence (mostly padding), and one far longer than the
# maximum length (gets truncated).
my_sentence1 = 'apple kiwi orange lksdhf <EOS> banana apple pfskak <EOS>'
my_sentence2 = 'kiwi'
my_sentence3 = 'orange '*100
for demo_sentence in (my_sentence1, my_sentence2, my_sentence3):
    print(sentence_to_padded_index(demo_sentence))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 3, 6, 5, 2, 1, 6, 5]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4]
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [3]:
# Graph input: a batch of padded id sequences; the batch dimension is left open.
with tf.name_scope("input_layer"):
    input_x = tf.placeholder(
        dtype=tf.int32,
        shape=[None, MAX_SENTENCE_LEN],
        name='input_x',
    )

In [4]:
# Place the lookup on the CPU and group its ops under the "embedding" scope.
with tf.device('/cpu:0'), tf.name_scope("embedding"):
    # Identity matrix as the embedding table: row i is the one-hot vector
    # for token id i, so the lookup turns ids into one-hot rows.
    embed_weights = tf.Variable(
        initial_value=tf.eye(num_rows=VOCABS_SIZE),
        name='one_hot_embedding_weights',
    )
    embedding = tf.nn.embedding_lookup(params=embed_weights, ids=input_x)

In [5]:
# Tiny demo corpus; tokens such as 'lkasjd' are deliberately out of vocabulary.
corpus = [
    'banana apple <EOS>',
    'orange lkasjd <EOS>',
    'lkasjd kiwi kiwi ldksj apple <EOS>',
]
# One fixed-length id sequence per sentence, ready to feed into `input_x`.
corpus_index = list(map(sentence_to_padded_index, corpus))


In [6]:
# Evaluate the embedding lookup for the whole corpus in a fresh session.
with tf.Session() as sess:
    # The embedding table is a tf.Variable, so it must be initialised
    # before the lookup can be evaluated.
    init = tf.global_variables_initializer()
    sess.run(init)
    out = sess.run(fetches=embedding, feed_dict={input_x: corpus_index})
    print(out)
    print(
        "corpus encoded to shape (`batch_size`, `max_sequence_len`, `embedding_size`):\n",
        out.shape,
    )

[[[1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]]

 [[1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1. 0.