In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def load_data(filename):
    """
    Load dataset
    """
    return pd.read_csv(filename, delimiter="\n", names=['text'])

english_full = load_data("data/small_vocab_en")
french_full = load_data("data/small_vocab_fr")

In [2]:
print(english_full.iloc[541:553,])
print(french_full.iloc[541:553,])
len(english_full)
len(french_full)

                                                  text
541  his most loved fruit is the lime , but her mos...
542  the grapefruit is your least liked fruit , but...
543            she was driving that shiny blue truck .
544  paris is never freezing during july , but it i...
545  california is sometimes cold during autumn , b...
546  california is sometimes warm during february ,...
547  paris is usually busy during june , and it is ...
548  the united states is sometimes nice during jul...
549  china is cold during december , and it is neve...
550  india is never dry during spring , but it is u...
551  the strawberry is our least liked fruit , but ...
552  the lemon is my least favorite fruit , but the...
                                                  text
541  son fruit le plus aimé est la chaux , mais son...
542  le pamplemousse est votre fruit moins aimé , m...
543          elle conduisait ce camion bleu brillant .
544  paris est jamais le gel en juillet , mais il e...
545  calif

137860

In [3]:
english, english_test, french, french_test = train_test_split(english_full, french_full, test_size = 0.1)

In [4]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import contractions
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin


def preprocess_text(corpus, language):
    corpus = corpus['text'].str.lower()
    corpus = corpus.map(lambda x: x.translate(str.maketrans('', '', punctuation)))
    if language == 'english':
        corpus = corpus.map(lambda x: contractions.fix(x))
    corpus = corpus.map(word_tokenize)
    return corpus

en_preprocessed = preprocess_text(english, 'english')
fr_preprocessed = preprocess_text(french, 'french')

en_preprocessed.head()

59175           [she, dislikes, apples, and, strawberries]
22551           [she, likes, bananas, limes, and, oranges]
62827    [the, united, states, is, freezing, during, se...
8733     [california, is, snowy, during, february, and,...
1244     [new, jersey, is, never, pleasant, during, sum...
Name: text, dtype: object

In [5]:
from itertools import count
from collections import defaultdict
from keras.preprocessing.sequence import pad_sequences


def word_to_id(token_list):
    '''creates a dictionary mapping each unique word to a unique integer id'''
    id_dict = defaultdict((count().__next__))
    for token in token_list:
        id_dict[token] = id_dict[token]
    return id_dict


def tokens_to_sequence(tokens):
    '''takes a dataframe with a "text" column and maps the text column to integers'''
    tokenlist = []
    for token in tokens:
        tokenlist.extend(token)
    id_dict = word_to_id(list(set(tokenlist)))
    sequence_list = tokens.map(lambda x: [id_dict[w] + 1 for w in x])
    return sequence_list, id_dict


def test_token_update(tokens, id_dict):
    '''takes a dataframe with a "text" column and a dictionary mapping words to ids. 
    Maps the text column to integers based on said dictionary'''
    sequence_list = tokens.map(lambda x: [id_dict[w] + 1 if w in id_dict.keys() else 0 for w in x])
    return sequence_list


def pad_sequence(sequence):
    '''takes an array of text and pads the sequence with 0s so all inputs are the same length'''
    return pad_sequences(sequence, len(max(sequence, key=len)), 'int', 'post', 'post', 0)


def pad_test_sequence(sequence, length = None):
    '''takes an array of text and a padded sequence, pads the array of text to the same length as the padded sequence'''
    if length:
        return pad_sequences(sequence, length, 'int', 'post', 'post', 0)
    return pad_sequences(sequence, len(max(train_sequence, key=len)), 'int', 'post', 'post', 0)


en_sequence, en_id_dict = tokens_to_sequence(en_preprocessed)
fr_sequence, fr_id_dict = tokens_to_sequence(fr_preprocessed)

en_padded = pad_sequence(en_sequence)
fr_padded = pad_sequence(fr_sequence)

en_len = en_padded.shape[1]
fr_len = fr_padded.shape[1]
en_vocab = max(en_id_dict.values()) + 1
fr_vocab = max(fr_id_dict.values()) + 1

Using TensorFlow backend.


In [6]:
def output_to_text(output, id_dict):
    idtt = {v + 1:k for k, v in id_dict.items()}
    idtt[0] = '<EMPTY>'
    idtt[fr_vocab+1] = '<START>'
    
    return ' '.join(idtt[word] for word in np.argmax(output, 1))

In [7]:
print(en_len, fr_len, en_vocab, fr_vocab)

15 21 197 343


## Baseline RNN

In [8]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, GRU, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

fr_padded_3d = fr_padded.reshape(*fr_padded.shape, 1)

tmp_x = pad_test_sequence(en_padded, fr_padded_3d.shape[1])
tmp_x = tmp_x.reshape((-1, fr_padded_3d.shape[-2]))


def compile_embedding_RNN(input_length, input_vocab_size, output_vocab_size, bidirectional = True, embed_size=200, 
                          lstm_size=256, lstm_dropout=.2, dense_size=1024, dense_dropout=.5, learning_rate=.001):
    '''compiles an RNN model.
    Input Length: The length of the input sequences
    Input vocab size: The size of the vocabulary for the input data
    Output vocab size: The size of the vocabulary for the labels
    Bidirectional: Should the RNN use a bidirectional layer?
    Embed size: the size of the embedding layer
    LSTM size: the size of the LSTM layer
    LSTM dropout: The amount of dropout to apply to the LSTM layer. Must be a number between 0 and 1.
    Dense size: the size of the Dense layer
    Dense dropout: The amount of dropout to apply to the dense layer. Must be an umber between 0 and 1.
    Learning rate: A small number, usually around .001 or .01'''
    
    model = Sequential()
    
    model.add(Embedding(input_vocab_size, embed_size, input_length=input_length,  input_shape=(input_length, )))
    if bidirectional:
        model.add(Bidirectional(LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', dropout=lstm_dropout, 
                       return_sequences = True)))
    else:
        model.add(LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', dropout=lstm_dropout, 
                       return_sequences = True))
    model.add(TimeDistributed(Dense(dense_size, activation='relu')))
    model.add(Dropout(dense_dropout))
    model.add(TimeDistributed(Dense(output_vocab_size, activation='softmax')))

    model.compile(loss = sparse_categorical_crossentropy, optimizer=Adam(learning_rate), metrics=['accuracy'])

    return model

embedding_size = 250
lstm_size = 256
lstm_dropout = .5
dense_size = 1024
dense_dropout = .5
learning_rate = .003

model = compile_embedding_RNN(fr_len, en_vocab+1, fr_vocab+1, True, embedding_size, lstm_size, lstm_dropout, dense_size, 
                              dense_dropout, learning_rate)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 21, 250)           49500     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 21, 512)           1038336   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 21, 1024)          525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 21, 1024)          0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 21, 344)           352600    
Total params: 1,965,748
Trainable params: 1,965,748
Non-trainable params: 0
_________________________________________________________________


In [27]:
history = model.fit(x = tmp_x, y = fr_padded_3d, epochs = 5, batch_size = 250, verbose = 1, validation_split = .2)

Train on 99259 samples, validate on 24815 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
output_to_text(model.predict(tmp_x[:1])[0], fr_id_dict)

model.save('baseline.h5')

## Unconditioned Encoder Decoder Model

In [9]:
from keras.layers import RepeatVector

def compile_enc_dec_RNN(input_length, input_vocab_size, output_vocab_size, embed_size=200, 
                          lstm_size=256, lstm_dropout=.2, dense_size=1024, dense_dropout=.5, learning_rate=.001):
    
    model = Sequential()
    
    model.add(Embedding(input_vocab_size, embed_size, input_length=input_length,  input_shape=(input_length, )))
    model.add(Bidirectional(LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', dropout=lstm_dropout)))
    model.add(RepeatVector(fr_len))
    model.add(Bidirectional(LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', dropout=lstm_dropout, 
                                 return_sequences = True)))
    model.add(TimeDistributed(Dense(dense_size, activation='relu')))
    model.add(Dropout(dense_dropout))
    model.add(TimeDistributed(Dense(output_vocab_size, activation='softmax')))

    model.compile(loss = sparse_categorical_crossentropy, optimizer=Adam(learning_rate), metrics=['accuracy'])

    return model

In [10]:
model2 = compile_enc_dec_RNN(en_len, en_vocab+1, fr_vocab+1, learning_rate = .003)
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 15, 200)           39600     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               935936    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 21, 512)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 21, 512)           1574912   
_________________________________________________________________
time_distributed_3 (TimeDist (None, 21, 1024)          525312    
_________________________________________________________________
dropout_2 (Dropout)          (None, 21, 1024)          0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 21, 344)          

In [11]:
model2.fit(en_padded, fr_padded_3d, epochs = 5, batch_size = 64, verbose = 1, validation_split = .2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 99259 samples, validate on 24815 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1aeb1a099b0>

In [12]:
output_to_text(model2.predict(en_padded[:1])[0], fr_id_dict)
model2.save('unconditioned_encoder_decoder.h5')

## Conditioned Encoder Decoder Model

In [10]:
decoder_input_data = np.zeros((en_sequence.shape[0], fr_len), dtype = 'uint16')
fr_padded_3d = np.zeros((en_sequence.shape[0], fr_len, fr_vocab+2), dtype = 'uint16')
for i, sequence in enumerate(fr_padded):
    for j, word in enumerate(sequence):
        fr_padded_3d[i, j, word] = 1
        if j == 0:
            decoder_input_data[i, j] = fr_vocab + 1
        else:
            decoder_input_data[i, j] = fr_padded[i, j-1]
            
en_padded = pad_sequences(en_padded, en_len + 1, padding = 'pre', value = en_vocab+1)



In [11]:
fr_padded_3d.shape

(124074, 21, 346)

In [12]:
from keras.models import Model
from keras.layers import Input

In [13]:
def define_models(n_input, n_output, vec_len, n_units):
    '''
    n_input: Number of first language words
    n_output: Number of second language words
    vec_len: Length of embedding vectors
    n_units: Number of hidden units
    '''
    encoder_inputs = Input(shape = (None,))
    enc_embed = Embedding(n_input, vec_len)
    encoder = LSTM(n_units, return_state = True)
    encoder_outputs, state_h, state_c = encoder(enc_embed(encoder_inputs))
    encoder_states = [state_h, state_c]
    
    decoder_inputs = Input(shape=(None,))
    dec_embed = Embedding(n_output, vec_len)
    decoder_lstm = LSTM(n_units, return_sequences = True, return_state = True)
    decoder_outputs, _, _ = decoder_lstm(dec_embed(decoder_inputs), initial_state = encoder_states)
    decoder_dense = Dense(n_output, activation = 'softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    encoder_model = Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(dec_embed(decoder_inputs), initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    return model, encoder_model, decoder_model

model3, enc_model, dec_model = define_models(en_vocab+1, fr_vocab+2, 100, 256)

In [14]:
model3.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics = ['accuracy'])
model3.fit([en_padded, decoder_input_data], fr_padded_3d,
          batch_size=500,
          epochs=20,
          validation_split=0.2)
model3.save('conditioned_encoder_decoder.h5')

Train on 99259 samples, validate on 24815 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [15]:
idtt = {v + 1:k for k, v in fr_id_dict.items()}
idtt[0] = '<EMPTY>'
idtt[fr_vocab + 1] = '<START>'
    
def decode_sequence(input_seq):
    '''Takes an input sequence and predicts the output sequence'''
    # Encode the input as state vectors.
    tokens = preprocess_text(input_seq, 'english')
    sequence = test_token_update(tokens, en_id_dict)
    encoder_input = pad_test_sequence(sequence, en_len)
    states_value = enc_model.predict(encoder_input)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_vocab + 1
    
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_df = []
    
    for i, sequence in enumerate(encoder_input):
        decoded_sentence = []
        for j in range(len(sequence)):
            output_tokens, h, c = dec_model.predict([target_seq] + states_value)
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = idtt[sampled_token_index]
            decoded_sentence.append(sampled_word)
            target_seq = np.zeros((1,1))
            target_seq[0,0] = sampled_token_index
            states_value = [h,c]
        decoded_df.append(decoded_sentence)
        
    return decoded_df

decode_df = pd.DataFrame({"text": ["I like apples"]})
decode_sequence(decode_df)

[[ 55  92 111   0   0   0   0   0   0   0   0   0   0   0   0]]


[['jaime',
  'les',
  'pommes',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>',
  '<EMPTY>']]