In [None]:
import collections
import os
import pickle

import numpy as np
import tensorflow as tf
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.losses import sparse_categorical_crossentropy
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

import project_tests as tests

In [None]:
# Directory holding the pickled corpora. It defaults to the original location
# so existing setups keep working; set NMT_DATA_DIR to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get('NMT_DATA_DIR', '/home/hp')

# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# point DATA_DIR at files that come from a trusted source.
with open(os.path.join(DATA_DIR, 'en.pkl'), 'rb') as f:
    english_sentences = pickle.load(f)
with open(os.path.join(DATA_DIR, 'hi.pkl'), 'rb') as f:
    hindi_sentences = pickle.load(f)

In [None]:
# Show the first (up to) two sentence pairs as a quick sanity check of the
# loaded data. The range is capped by the corpus sizes so a tiny corpus does
# not raise IndexError, and the labels name the languages actually loaded
# (the second corpus is Hindi, not French).
for sample_i in range(min(2, len(english_sentences), len(hindi_sentences))):
    print('English Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('Hindi Line {}:  {}'.format(sample_i + 1, hindi_sentences[sample_i]))

In [None]:
# Frequency of every whitespace-separated token in each corpus.
english_words_counter = collections.Counter(
    word for sentence in english_sentences for word in sentence.split())
hindi_words_counter = collections.Counter(
    word for sentence in hindi_sentences for word in sentence.split())

# The total token count is the sum of the per-word frequencies, so the corpus
# does not need to be split a second time just to be counted.
print('{} English words.'.format(sum(english_words_counter.values())))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
# Iterating over the (word, count) pairs directly also works for an empty
# corpus, where indexing the result of zip(*...) would raise IndexError.
print('"' + '" "'.join(word for word, _count in english_words_counter.most_common(10)) + '"')
print()
print('{} Hindi words.'.format(sum(hindi_words_counter.values())))
print('{} unique Hindi words.'.format(len(hindi_words_counter)))
print('10 Most common words in the Hindi dataset:')
print('"' + '" "'.join(word for word, _count in hindi_words_counter.most_common(10)) + '"')

In [None]:
def tokenize(x):
    """
    Fit a new Keras Tokenizer on x and encode x with it
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (x encoded as sequences of word ids, the fitted Tokenizer)
    """
    tokenizer = Tokenizer()
    # The vocabulary is learned from the same texts that are then encoded.
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

In [None]:
def pad(x, length=None):
    """
    Bring every sequence in x to a common length, padding at the end
    :param x: List of sequences.
    :param length: Target length.  If None, the length of the longest sequence in x is used.
    :return: Padded numpy array of sequences
    """
    if length is None:
        # No explicit target: fall back to the longest sequence present.
        length = max(len(seq) for seq in x)

    # padding='post' appends the fill values after the real tokens.
    return pad_sequences(x, maxlen=length, padding='post')

In [None]:
def preprocess(x, y):
    """
    Tokenize and pad both sides of a parallel corpus
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer);
             preprocessed y carries an extra trailing axis of size 1
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    # Each side is padded to its own longest sequence.
    padded_x = pad(x_ids)
    padded_y = pad(y_ids)

    # Append a trailing axis of size 1 to the labels
    # (presumably the layout sparse_categorical_crossentropy expects -- TODO confirm).
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tk, y_tk

# Tokenize and pad both corpora, keeping the fitted tokenizers for decoding.
(preproc_english_sentences,
 preproc_hindi_sentences,
 english_tokenizer,
 hindi_tokenizer) = preprocess(english_sentences, hindi_sentences)

# Axis 1 of each padded array is the sequence (timestep) axis.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_hindi_sequence_length = preproc_hindi_sentences.shape[1]

# Number of distinct words each tokenizer learned. The models add 1 to these
# sizes, leaving room for id 0, which logits_to_text decodes as '<PAD>'.
english_vocab_size = len(english_tokenizer.word_index)
hindi_vocab_size = len(hindi_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max Hindi sentence length:", max_hindi_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("Hindi vocabulary size:", hindi_vocab_size)

In [None]:
def logits_to_text(logits, tokenizer):
    """
    Decode per-timestep network outputs into a space-separated string
    :param logits: 2-D scores from a neural network, one row per timestep
    :param tokenizer: Keras Tokenizer fit on the labels (only its word_index is read)
    :return: String made of the highest-scoring word of every timestep
    """
    vocabulary = tokenizer.word_index
    # Invert the word -> id mapping so ids can be looked up.
    id_to_word = dict(zip(vocabulary.values(), vocabulary.keys()))
    # Id 0 is assigned last, so it always decodes to the padding marker.
    id_to_word[0] = '<PAD>'

    # Pick the best-scoring id along axis 1 for every row.
    predicted_ids = np.argmax(logits, axis=1)
    return ' '.join(id_to_word[token_id] for token_id in predicted_ids)

print('`logits_to_text` function loaded.')

In [None]:
from keras.callbacks import TensorBoard
from time import time
# TensorBoard callback that writes to a fresh directory under logs/, named
# after the current timestamp so successive runs do not share a directory.
# NOTE(review): none of the fit() calls visible in this notebook pass this
# callback (it only appears in a commented-out hint), so nothing is logged
# unless callbacks=[tensorboard] is added to a fit() call.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()), histogram_freq=1, write_graph=True)

In [None]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):
    """
    Assemble and compile a single-layer GRU model (returned untrained)

    The input goes straight into one GRU that emits an output at every
    timestep; a Dense layer applied per timestep plus a softmax turns each
    output into a distribution over Hindi word ids.
    :param input_shape: Tuple of input shape; the leading (sample) axis is dropped
    :param output_sequence_length: Length of output sequence (not used by this model)
    :param english_vocab_size: Number of unique English words in the dataset; the GRU has this many units plus one
    :param hindi_vocab_size: Number of unique Hindi words in the dataset; the output layer has one more unit than this
    :return: Keras model built and compiled, but not trained
    """
    source_seq = Input(shape=input_shape[1:])
    recurrent_out = GRU(units=english_vocab_size + 1, return_sequences=True)(source_seq)
    # One extra output class leaves room for id 0, decoded as '<PAD>' by logits_to_text.
    per_step_scores = TimeDistributed(Dense(units=hindi_vocab_size + 1))(recurrent_out)
    word_probabilities = Activation('softmax')(per_step_scores)

    model = Model(source_seq, word_probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(lr=1e-3),
        metrics=['accuracy'],
    )
    return model

# The model emits one prediction per input timestep, so the English ids are
# padded to the Hindi sequence length.
# NOTE(review): if any English sequence is longer than
# max_hindi_sequence_length, pad_sequences will cut it down -- confirm that
# this cannot happen for this corpus.
tmp_x = pad(preproc_english_sentences, max_hindi_sequence_length)
# Add a trailing feature axis of size 1, since the GRU reads the ids directly.
tmp_x = tmp_x.reshape((-1, max_hindi_sequence_length, 1))

simple_rnn_model = simple_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_hindi_sequence_length,
    english_vocab_size=english_vocab_size,
    hindi_vocab_size=hindi_vocab_size)

simple_rnn_model.fit(
    tmp_x,
    preproc_hindi_sentences,
    batch_size=1024,
    epochs=10,
    validation_split=0.2)

# Decode the prediction for the first training sentence as a smoke test.
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], hindi_tokenizer))

In [None]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):
    """
    Assemble and compile a GRU model with a word embedding (returned untrained)

    Word ids are embedded, passed through one GRU that emits an output at
    every timestep, and mapped per timestep to a softmax over Hindi word ids.
    :param input_shape: Tuple of input shape; the leading (sample) axis is dropped
    :param output_sequence_length: Length of output sequence (not used by this model)
    :param english_vocab_size: Number of unique English words in the dataset; the embedding table has one more row than this
    :param hindi_vocab_size: Number of unique Hindi words in the dataset; the output layer has one more unit than this
    :return: Keras model built and compiled, but not trained
    """
    # Hyperparameters
    embedding_size = 128
    rnn_cells = 200
    dropout = 0.0
    learning_rate = 1e-3

    # Length of the sequence axis, i.e. the first per-sample dimension.
    sequence_length = input_shape[1]

    source_ids = Input(shape=input_shape[1:])
    embedded = Embedding(
        input_dim=english_vocab_size + 1,
        output_dim=embedding_size,
        input_length=sequence_length,
    )(source_ids)

    recurrent_out = GRU(units=rnn_cells, dropout=dropout, return_sequences=True)(embedded)
    per_step_scores = TimeDistributed(Dense(units=hindi_vocab_size + 1))(recurrent_out)
    word_probabilities = Activation('softmax')(per_step_scores)

    model = Model(source_ids, word_probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(lr=learning_rate),
        metrics=['accuracy'],
    )
    return model
    

# The Embedding layer takes the padded id matrix as-is, so unlike the cells
# for the non-embedding models no trailing feature axis is added here.
tmp_x = pad(preproc_english_sentences, max_hindi_sequence_length)

embed_rnn_model = embed_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

embed_rnn_model.fit(
    tmp_x,
    preproc_hindi_sentences,
    batch_size=1024,
    epochs=10,
    validation_split=0.2)

# Decode the prediction for the first training sentence as a smoke test.
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], hindi_tokenizer))

In [None]:
# Repeats the sample prediction printed at the end of the embedding training
# cell, using the already-trained embed_rnn_model and the current tmp_x.
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], hindi_tokenizer))

In [None]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):
    """
    Build a bidirectional RNN model on x and y (returned compiled, not trained)

    The input is read by a forward and a backward GRU; their per-timestep
    outputs are concatenated and mapped per timestep to a softmax over Hindi
    word ids.
    :param input_shape: Tuple of input shape; the leading (sample) axis is dropped
    :param output_sequence_length: Length of output sequence (not used by this model)
    :param english_vocab_size: Number of unique English words in the dataset; each GRU direction has this many units plus one
    :param hindi_vocab_size: Number of unique Hindi words in the dataset; the output layer has one more unit than this
    :return: Keras model built, but not trained
    """

    dropout = 0.0
    learning_rate = 1e-3

    print("Using Functional API")
    input_seq = Input(shape=input_shape[1:])

    # A GRU created with go_backwards=True returns its outputs in reversed
    # time order, so concatenating it directly with a forward GRU pairs
    # timestep t of one direction with timestep T-1-t of the other. The
    # Bidirectional wrapper flips the backward outputs back before merging,
    # which keeps both directions aligned on the same timestep.
    bidirectional_rnn = Bidirectional(
        GRU(units=english_vocab_size+1, dropout=dropout, return_sequences=True),
        merge_mode='concat')(input_seq)

    logits = TimeDistributed(Dense(units=hindi_vocab_size+1))(bidirectional_rnn)

    model = Model(input_seq, Activation('softmax')(logits))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(lr=learning_rate),
                  metrics=['accuracy'])

    return model
   

# Pad the English ids to the Hindi sequence length and add a trailing feature
# axis of size 1, since this model reads the ids directly.
tmp_x = pad(preproc_english_sentences, max_hindi_sequence_length)
tmp_x = tmp_x.reshape((-1, max_hindi_sequence_length, 1))

bd_rnn_model = bd_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

bd_rnn_model.fit(
    tmp_x,
    preproc_hindi_sentences,
    batch_size=1024,
    epochs=10,
    validation_split=0.2)

# Decode the prediction for the first training sentence as a smoke test.
print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], hindi_tokenizer))

In [None]:
def encdec_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):
    """
    Assemble and compile a GRU encoder-decoder model (returned untrained)

    The encoder GRU condenses the input into its final output and state. The
    final output is repeated once per output timestep and fed to a decoder GRU
    that starts from the encoder state; a per-timestep Dense layer and softmax
    produce the word distribution.
    :param input_shape: Tuple of input shape; the leading (sample) axis is dropped
    :param output_sequence_length: Length of output sequence, i.e. how many times the encoder output is repeated
    :param english_vocab_size: Not used by this model
    :param hindi_vocab_size: Number of units of the output layer. Used as given (no +1 is added here,
                             unlike the other models in this notebook), so the caller passes the
                             full number of output classes
    :return: Keras model built and compiled, but not trained
    """
    # Hyperparameters
    rnn_cells = 200
    dropout = 0.0
    learning_rate = 1e-3

    # Encoder: only the last output and the final hidden state are kept.
    encoder_input_seq = Input(shape=input_shape[1:], name="enc_input")
    encoder_rnn = GRU(
        units=rnn_cells,
        dropout=dropout,
        return_sequences=False,
        return_state=True,
        name="enc_rnn",
    )
    encoder_output, encoder_state = encoder_rnn(encoder_input_seq)

    # Decoder: the same encoder output is presented at every output timestep,
    # and the decoder GRU is initialised with the encoder's final state.
    repeated_context = RepeatVector(output_sequence_length)(encoder_output)
    decoder_rnn = GRU(
        units=rnn_cells,
        dropout=dropout,
        return_sequences=True,
        return_state=False,
    )
    decoder_out = decoder_rnn(repeated_context, initial_state=encoder_state)

    per_step_scores = TimeDistributed(Dense(units=hindi_vocab_size))(decoder_out)
    word_probabilities = Activation('softmax')(per_step_scores)

    model = Model(encoder_input_seq, word_probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(lr=learning_rate),
        metrics=['accuracy'],
    )
    return model
    

# Train the encoder-decoder model and print a sample prediction.

# This model has no Embedding layer: pad the English ids to the Hindi sequence
# length and add a trailing feature axis of size 1.
tmp_x = pad(preproc_english_sentences, max_hindi_sequence_length)
tmp_x = tmp_x.reshape((-1, max_hindi_sequence_length, 1))

# encdec_model uses the vocabulary sizes exactly as given, so the extra slot
# for id 0 is added here at the call site.
encdec_rnn_model = encdec_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size + 1,
    hindi_vocab_size + 1)

encdec_rnn_model.fit(
    tmp_x,
    preproc_hindi_sentences,
    batch_size=1024,
    epochs=10,
    validation_split=0.2)  # callbacks=[tensorboard]

# Decode the prediction for the first training sentence as a smoke test.
print(logits_to_text(encdec_rnn_model.predict(tmp_x[:1])[0], hindi_tokenizer))
    
 

In [None]:
def model_final(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):
    """
    Assemble and compile a model combining embedding, a two-direction LSTM
    encoder and an LSTM decoder (returned untrained)

    Word ids are embedded and read by a forward and a backward LSTM. Their
    final outputs and states are concatenated; the joined output is repeated
    once per output timestep and decoded by an LSTM twice as wide, which is
    initialised with the joined states.
    :param input_shape: Tuple of input shape; the leading (sample) axis is dropped
    :param output_sequence_length: Length of output sequence, i.e. how many times the encoder output is repeated
    :param english_vocab_size: Number of unique English words in the dataset; the embedding table has one more row than this
    :param hindi_vocab_size: Number of unique Hindi words in the dataset; the output layer has one more unit than this
    :return: Keras model built and compiled, but not trained
    """
    from keras.layers import LSTM, concatenate

    # Hyperparameters
    embedding_size = 128
    rnn_cells = 300
    dropout = 0.2
    learning_rate = 1e-3

    # Input and embedding
    source_ids = Input(shape=input_shape[1:])
    embedded = Embedding(
        input_dim=english_vocab_size + 1,
        output_dim=embedding_size,
        input_length=input_shape[1],
    )(source_ids)

    def run_encoder(backwards):
        # One encoder direction; returns (last output, hidden state, cell state).
        return LSTM(
            units=rnn_cells,
            dropout=dropout,
            return_sequences=False,
            return_state=True,
            go_backwards=backwards,
        )(embedded)

    fwd_output, fwd_h, fwd_c = run_encoder(False)
    bwd_output, bwd_h, bwd_c = run_encoder(True)

    # Join both directions; every joined tensor is 2 * rnn_cells wide.
    state_h = concatenate([fwd_h, bwd_h])
    state_c = concatenate([fwd_c, bwd_c])
    encoder_output = concatenate([fwd_output, bwd_output])

    # The decoder width matches the joined encoder states it starts from.
    repeated_context = RepeatVector(output_sequence_length)(encoder_output)
    decoder_rnn = LSTM(
        units=rnn_cells * 2,
        dropout=dropout,
        return_sequences=True,
        return_state=False,
        go_backwards=False,
    )
    decoder_output = decoder_rnn(repeated_context, initial_state=[state_h, state_c])

    per_step_scores = TimeDistributed(Dense(units=hindi_vocab_size + 1))(decoder_output)
    word_probabilities = Activation('softmax')(per_step_scores)

    model = Model(source_ids, word_probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(lr=learning_rate),
        metrics=['accuracy'],
    )
    return model
       
print('Final Model Loaded\n')
# The final model starts with an Embedding layer, so the padded id matrix is
# used without adding a trailing feature axis.
tmp_x = pad(preproc_english_sentences, max_hindi_sequence_length)
final_rnn_model = model_final(input_shape = tmp_x.shape,
                              output_sequence_length = max_hindi_sequence_length,
                              english_vocab_size = english_vocab_size,
                              hindi_vocab_size = hindi_vocab_size)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
final_rnn_model.summary(line_length=125)


final_rnn_model.fit(tmp_x, preproc_hindi_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Decode the prediction for the first training sentence as a smoke test.
print(logits_to_text(final_rnn_model.predict(tmp_x[:1])[0], hindi_tokenizer))
    

In [None]:
def final_predictions(x, y, x_tk, y_tk):
    """
    Train a fresh final model and print sample translations

    Prints three lines: the prediction for a fixed English probe sentence, the
    prediction for the first sample of x, and the reference labels of that
    first sample.
    :param x: Preprocessed English data
    :param y: Preprocessed Hindi data
    :param x_tk: English tokenizer
    :param y_tk: Hindi tokenizer
    """
    # Train a new model_final instance on the given data.
    model = model_final(input_shape = x.shape,
                        output_sequence_length = y.shape[1],
                        english_vocab_size = len(x_tk.word_index),
                        hindi_vocab_size = len(y_tk.word_index))

    model.fit(x, y, batch_size=1024, epochs=20, validation_split=0.2)

    # Invert the Hindi word -> id mapping; id 0 decodes to the padding marker.
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw an old truck'
    # Words of the probe sentence that the English tokenizer never saw are
    # skipped instead of raising KeyError, so the function also works on a
    # corpus that does not contain all of them.
    sentence = [x_tk.word_index[word] for word in sentence.split() if word in x_tk.word_index]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    # The second positional argument of predict is the batch size.
    predictions = model.predict(sentences, len(sentences))

    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[0]]))
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[1]]))
    # Each entry of y[0] is reduced with np.max to recover its word id.
    print(' '.join([y_id_to_word[np.max(token)] for token in y[0]]))



In [None]:
# Trains a new final model on the full preprocessed corpus (final_predictions
# builds and fits its own model) and prints the sample translations.
final_predictions(preproc_english_sentences, preproc_hindi_sentences, english_tokenizer, hindi_tokenizer)