In [1]:
import tensorflow as tf
import numpy as np
import codecs
import re as regex
import re
from tqdm import tqdm
import math

In [14]:
src_sents = [line for line in
                 codecs.open("test.txt", 'r', 'utf-8').read().split("\n") if line]

In [3]:
len(src_sents)

7

In [4]:
def load_vocab(path):
    vocab = [line.split()[0] for line in codecs.open(path, 'r', 'utf-8')]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

In [5]:
def load_source_vocab():
    return load_vocab("src.vocab.tsv")


def load_target_vocab():
    return load_vocab("tgt.vocab.tsv")

In [6]:
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
def basic_tokenizer(sentence, lower=True):
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_fragment))
    return [w.lower() if lower else w for w in words if w != '' and w != ' ']

In [11]:
def create_test_data(source_sents):
    src2idx, idx2src = load_source_vocab()
    # Index
    x_list, sources = [], []
    for source_sent in tqdm(source_sents, desc="Preparing data: ", total=len(source_sents)):
        source_sent = basic_tokenizer(source_sent,lower=False)
        x = [src2idx.get(word, src2idx["<unk>"]) for word in source_sent]
        x_list.append(np.array(x))
        sources.append(source_sent)
        print(source_sent)

    max_infer_len = np.max([len(x) for x in x_list])
    X = np.zeros([len(x_list), max_infer_len], np.int32)
    actual_lengths = []
    for i, x in tqdm(enumerate(x_list), desc="Padding: ", total=len(x_list)):
        actual_lengths.append(len(x))
        X[i] = np.lib.pad(x, [0, max_infer_len - len(x)], 'constant', constant_values=(0, 0))

    return X, sources, actual_lengths

In [15]:
X, Sources, actual_lengths = create_test_data(src_sents)

Preparing data:   0%|                                                                            | 0/7 [00:00<?, ?it/s]

['Bo', 'phim', 'lan', 'dau', 'duoc', 'cong', 'chieu', 'tai', 'lien', 'hoan', 'phim', 'Rome', '2007', 'va', 'sau', 'do', 'duoc', 'chieu', 'o', 'Fairbanks', ',', 'Alaska', 'ngay', '21', 'thang', '9', 'nam', '2007', '.']
['Troi', 'mua', 'mua', 'ao', 'mua']
['me', 'bao', 'em', 'dam', 'dang']
['cái', 'nha', 'rung', 'lac', 'Vi', 'anh', 'em', 'ba', 'con', 'nhay', 'ram', 'ram', 'nhu', 'muon', 'sap']
['anh', 'ngu', 'chua', ',', 'sao', 'anh', 'khong', 'den', ',', 'em', 'om', '2', 'thang', 'nay', 'met', 'lam']
['em', 'boc', 'cut', 'lon', 'lam', 'gi', 'The']
['Hom', 'nay', ',', 'bao', 'cao', 'cua', 'Counterpoint', 'Research', 'cho', 'thay', ',', 'trong', 'nam', '2018', 'Apple', 'da', 'ban', 'duoc', 'khoang', '35', 'trieu', 'cap', 'tai', 'nghe', 'khong', 'day', 'AirPods', '.', 'Theo', 'hang', 'phan', 'tich', 'nay', ',', 'AirPods', 'hien', 'la', 'tai', 'nghe', 'khong', 'day', 'pho', 'bien', 'nhat', '.']


Preparing data: 100%|███████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 999.77it/s]
Padding: 100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<?, ?it/s]


In [24]:
basic_tokenizer(src_sents[0], lower=False)

['Bo',
 'phim',
 'lan',
 'dau',
 'duoc',
 'cong',
 'chieu',
 'tai',
 'lien',
 'hoan',
 'phim',
 'Rome',
 '2007',
 'va',
 'sau',
 'do',
 'duoc',
 'chieu',
 'o',
 'Fairbanks',
 ',',
 'Alaska',
 'ngay',
 '21',
 'thang',
 '9',
 'nam',
 '2007',
 '.']

In [17]:
Sources

[['Bo',
  'phim',
  'lan',
  'dau',
  'duoc',
  'cong',
  'chieu',
  'tai',
  'lien',
  'hoan',
  'phim',
  'Rome',
  '2007',
  'va',
  'sau',
  'do',
  'duoc',
  'chieu',
  'o',
  'Fairbanks',
  ',',
  'Alaska',
  'ngay',
  '21',
  'thang',
  '9',
  'nam',
  '2007',
  '.'],
 ['Troi', 'mua', 'mua', 'ao', 'mua'],
 ['me', 'bao', 'em', 'dam', 'dang'],
 ['cái',
  'nha',
  'rung',
  'lac',
  'Vi',
  'anh',
  'em',
  'ba',
  'con',
  'nhay',
  'ram',
  'ram',
  'nhu',
  'muon',
  'sap'],
 ['anh',
  'ngu',
  'chua',
  ',',
  'sao',
  'anh',
  'khong',
  'den',
  ',',
  'em',
  'om',
  '2',
  'thang',
  'nay',
  'met',
  'lam'],
 ['em', 'boc', 'cut', 'lon', 'lam', 'gi', 'The'],
 ['Hom',
  'nay',
  ',',
  'bao',
  'cao',
  'cua',
  'Counterpoint',
  'Research',
  'cho',
  'thay',
  ',',
  'trong',
  'nam',
  '2018',
  'Apple',
  'da',
  'ban',
  'duoc',
  'khoang',
  '35',
  'trieu',
  'cap',
  'tai',
  'nghe',
  'khong',
  'day',
  'AirPods',
  '.',
  'Theo',
  'hang',
  'phan',
  'tich',
  'n

In [31]:
X[6]

array([1525,   39,    6,   98,  111,  118,    1,    1,   78,   65,    6,
         60,   25,    1,    1,   97,   42,   13,  385,    1,  267,  215,
         16,  272,   93,  218,    1,    4,  592,  249,  191,  567,   39,
          6,    1,  213,   50,   16,  272,   93,  218,  679,  654,  287,
          4])

In [27]:
src2idx, idx2src = load_source_vocab()

In [36]:
src2idx["Hom"]

1525