In [1]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import os

# --- Training hyper-parameters ---
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
# NOTE(review): num_samples and data_path are not referenced by any cell visible in
# this part of the notebook; possibly leftovers - confirm before removing.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = '../data/fra.txt'
# --- Text preprocessing ---
MAX_SEQ_LEN = 250  # Passed as maxlen to pad_sequences for every corpus.
MAX_NUM_WORDS = 10000  # Vocabulary cap: Tokenizer num_words and bound for the embedding matrix rows.
EMBEDDING_DIM = 100  # Number of columns of the embedding matrix; presumably matches the 100d GloVe file - confirm.

# --- Filesystem locations ---
BASE_DATA_DIR = os.path.join("../../", "data")  # Directory holding the corpus files and the GloVe vectors.
# NOTE(review): the four names below are not used in the cells visible here;
# presumably output locations for logs, models and checkpoints - confirm.
BASIC_PERSISTENT_DIR = '/persistent/gpu2/'
GRAPH_DIR = 'graph_stack2/'
MODEL_DIR = 'model_stack2/'
MODEL_CHECKPOINT_DIR = 'model_chkp_stack2/'


# os.environ["CUDA_VISIBLE_DEVICES"] = ""

def load(file):
    """
    Read a UTF-8 encoded text file and return its lines.

    The number of lines read is printed as a side effect.

    :param file: path of the file to read
    :return: list with one string per line (line endings are kept)
    """
    with open(file, encoding='utf8') as handle:
        lines = handle.readlines()
    print('Loaded', len(lines), "lines of data.")
    return lines


def preprocess_data(train_input_data, train_target_data, val_input_data, val_target_data):
    """
    Tokenize and pad the four corpora and build the matching embedding matrix.

    :param train_input_data: training texts of the input side
    :param train_target_data: training texts of the target side
    :param val_input_data: validation texts of the input side
    :param val_target_data: validation texts of the target side
    :return: the four corpora as post-padded sequences of length MAX_SEQ_LEN (same order
             as the parameters), followed by the embedding matrix and its number of rows
    """
    # tokenize() returns the four index sequences followed by the word index.
    *token_sequences, word_index = tokenize(train_input_data, train_target_data,
                                            val_input_data, val_target_data)

    # Pad every corpus at the end of each sequence; order matches the parameters.
    train_input_data, train_target_data, val_input_data, val_target_data = [
        pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
        for sequences in token_sequences
    ]

    embedding_matrix, num_words = prepare_embedding_matrix(word_index, load_embedding())

    return train_input_data, train_target_data, val_input_data, val_target_data, embedding_matrix, num_words


def tokenize(train_input_data, train_target_data, val_input_data, val_target_data):
    """
    Convert the four text corpora to sequences of word indices.

    One Tokenizer (num_words=MAX_NUM_WORDS) is fitted on the concatenation of all four
    corpora, so every corpus shares the same index space.

    NOTE(review): the validation texts take part in fitting the vocabulary - confirm
    this is intended.

    :param train_input_data: list of training texts of the input side
    :param train_target_data: list of training texts of the target side
    :param val_input_data: list of validation texts of the input side
    :param val_target_data: list of validation texts of the target side
    :return: the four corpora as index sequences (same order as the parameters),
             followed by the tokenizer's word_index
    """
    corpora = (train_input_data, train_target_data, val_input_data, val_target_data)

    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(train_input_data + train_target_data + val_input_data + val_target_data)

    train_input_seqs, train_target_seqs, val_input_seqs, val_target_seqs = [
        tokenizer.texts_to_sequences(corpus) for corpus in corpora
    ]

    return train_input_seqs, train_target_seqs, val_input_seqs, val_target_seqs, tokenizer.word_index


def load_embedding(filename=None):
    """
    Load pre-trained word vectors from a whitespace-separated text file.

    Every non-empty line is expected to hold a word followed by the components of its
    vector. Blank lines are skipped.

    :param filename: path of the vector file; when omitted, 'glove.6B.100d.txt' inside
                     BASE_DATA_DIR is used
    :return: dict mapping each word to a float32 numpy array
    """
    print('Indexing word vectors.')

    if filename is None:
        filename = os.path.join(BASE_DATA_DIR, 'glove.6B.100d.txt')

    embeddings_index = {}
    with open(filename, 'r', encoding='utf8') as f:
        # Iterate the handle lazily: readlines() would keep the whole file in memory
        # in addition to the dictionary being built.
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a stray newline at the end of the file): no word to index.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index


def prepare_embedding_matrix(word_index, embeddings_index):
    """
    Build the matrix of pre-trained vectors for the tokenizer vocabulary.

    Row i holds the vector of the word whose index is i. Rows stay all-zero for index 0
    and for words that have no entry in embeddings_index. Words with an index of
    MAX_NUM_WORDS or above are ignored.

    :param word_index: dict mapping word -> integer index
    :param embeddings_index: dict mapping word -> vector of length EMBEDDING_DIM
    :return: tuple (embedding matrix of shape (num_words, EMBEDDING_DIM), num_words)
    """
    print('Preparing embedding matrix.')

    # One extra row on top of the (capped) vocabulary size.
    num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    for word, row in word_index.items():
        if row < MAX_NUM_WORDS:
            vector = embeddings_index.get(word)
            if vector is not None:
                embedding_matrix[row] = vector

    return embedding_matrix, num_words


Using TensorFlow backend.


In [5]:


# Corpus file names; they are resolved against BASE_DATA_DIR below.
TRAIN_EN_FILE, TRAIN_DE_FILE = "train.en", "train.de"
VAL_EN_FILE, VAL_DE_FILE = "newstest2014.en", "newstest2014.de"

english_train_file, german_train_file, english_val_file, german_val_file = [
    os.path.join(BASE_DATA_DIR, file_name)
    for file_name in (TRAIN_EN_FILE, TRAIN_DE_FILE, VAL_EN_FILE, VAL_DE_FILE)
]

# English is the input side, German the target side.
train_input_data, train_target_data = load(english_train_file), load(german_train_file)


Loaded 99999 lines of data.
Loaded 99999 lines of data.


In [7]:
# Validation corpus, same direction as the training data: English in, German out.
val_input_data, val_target_data = load(english_val_file), load(german_val_file)

# NOTE(review): the call below rebinds the raw text lists to their tokenized, padded
# form, so this cell is not idempotent - re-running it without first re-running the
# cell that loads the training data feeds processed arrays back into preprocess_data.
(train_input_data, train_target_data,
 val_input_data, val_target_data,
 embedding_matrix, num_words) = preprocess_data(train_input_data, train_target_data,
                                                val_input_data, val_target_data)



Loaded 2737 lines of data.
Loaded 2737 lines of data.
Indexing word vectors.
Found 400000 word vectors.
Preparing embedding matrix.


In [28]:
def get_decoder_target_data(decoder_input_data):
    """
    Return a copy of a batch of sequences shifted one step to the right along axis 1.

    Position 0 of every row is 0 and position j + 1 holds the value the argument had at
    position j; the last value of every row is dropped. The result is always a new
    float64 array (the np.zeros default) and the argument itself is not modified.
    The shapes of the argument and of the result are printed as a side effect.

    NOTE(review): the name suggests "targets derived from decoder inputs", but the code
    performs a right shift with a leading 0 - confirm the naming against the callers.

    :param decoder_input_data: numpy array with at least two dimensions, indexed as
                               (sample, position, ...)
    :return: float64 numpy array of the same shape, shifted right by one position
    """
    print("decoder_input_data.shape", decoder_input_data.shape)
    decoder_target_data = np.zeros(decoder_input_data.shape)
    print("decoder_target_data.shape", decoder_target_data.shape)
    # A single slice assignment replaces the per-element Python loops. Column 0 keeps
    # the 0 it was initialised with, and a batch with zero positions no longer fails.
    decoder_target_data[:, 1:] = decoder_input_data[:, :-1]
    return decoder_target_data

In [29]:
# Shift the padded target sequences one position to the right (leading 0) to obtain
# the sequences stored as decoder input.
train_decoder_input_data = get_decoder_target_data(train_target_data)

decoder_input_data.shape (99999, 250)
decoder_target_data.shape (99999, 250)


In [30]:
# The shifted array is allocated with the shape of its input, so this should show the
# same shape as the padded target data.
train_decoder_input_data.shape

(99999, 250)

In [31]:
# Tiny hand-made batch used to check the shift helper by eye.
a = np.array([[1., 0., 0.],
              [0., 2., 3.]])
a

array([[ 1.,  0.,  0.],
       [ 0.,  2.,  3.]])

In [32]:
# Sanity check: every row should come back shifted right by one position, with a 0 in
# front and its last value dropped.
get_decoder_target_data(a)

decoder_input_data.shape (2, 3)
decoder_target_data.shape (2, 3)


array([[ 0.,  1.,  0.],
       [ 0.,  0.,  2.]])