In [4]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, Bidirectional, Dropout, TimeDistributed
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

# Load the raw English and French corpora; preproc_english_sentences and
# preproc_french_sentences are derived from them further down in this cell.
# Adapt the paths and loading logic to your own data layout.
def load_data(path):
    """Read a text file and return its contents split into lines.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of strings obtained by splitting the file contents on '\n'.
        A file ending in a newline therefore yields a trailing empty string.
    """
    # Decode explicitly as UTF-8 so accented characters do not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()
    return data.split('\n')

# Read both sides of the corpus (English first, then French).
english_sentences, french_sentences = (
    load_data(corpus_path) for corpus_path in ('data/english', 'data/french')
)

# Tokenize the sentences
def tokenize(x):
    """Fit a fresh Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: Collection of sentences (strings) to fit on and encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(x)`` and the fitted tokenizer itself.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    sequences = fitted.texts_to_sequences(x)
    return sequences, fitted

# Toy sentences used to eyeball what tokenize() produces
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']

text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_no, (raw_sentence, token_ids) in enumerate(zip(text_sentences, text_tokenized), start=1):
    print('Sequence {} in x'.format(sample_no))
    print('  Input:  {}'.format(raw_sentence))
    print('  Output: {}'.format(token_ids))

# Tokenize the full corpora, one tokenizer per language
preproc_english_sentences, english_tokenizer = tokenize(english_sentences)
preproc_french_sentences, french_tokenizer = tokenize(french_sentences)

# Pad the sequences
# Both sides are padded to the longest French sentence, so inputs and targets
# end up with the same number of time steps.
max_french_sequence_length = max(len(sentence) for sentence in preproc_french_sentences)
tmp_x = pad_sequences(
    preproc_english_sentences,
    maxlen=max_french_sequence_length,
    padding='post',
)
preproc_french_sentences = pad_sequences(
    preproc_french_sentences,
    maxlen=max_french_sequence_length,
    padding='post',
)

#pad
def pad(x, length=None):
    """Post-pad every sequence in ``x`` to a common length.

    Args:
        x: List of token-id sequences.
        length: Target length. When ``None``, the length of the longest
            sequence in ``x`` is used.

    Returns:
        The result of ``pad_sequences(x, maxlen=..., padding='post')``.
    """
    target_len = max(len(sentence) for sentence in x) if length is None else length
    return pad_sequences(x, maxlen=target_len, padding='post')

# Show pad() on the toy sample: each sequence is printed before and after
test_pad = pad(text_tokenized)
for sample_no, (token_ids, padded_ids) in enumerate(zip(text_tokenized, test_pad), start=1):
    print('Sequence {} in x'.format(sample_no))
    print('  Input:  {}'.format(np.array(token_ids)))
    print('  Output: {}'.format(padded_ids))

# Reshape input: add a trailing axis of size 1 after the time-step axis
tmp_x = np.reshape(tmp_x, (-1, max_french_sequence_length, 1))

# Check the shapes of the inputs and targets
print(f'tmp_x shape: {tmp_x.shape}')
print(f'preproc_french_sentences shape: {preproc_french_sentences.shape}')

# Vocabulary sizes: one more than the number of entries in each word_index
english_vocab_size = 1 + len(english_tokenizer.word_index)
french_vocab_size = 1 + len(french_tokenizer.word_index)

# Define the model
def simple_model(input_shape, english_vocab_size, french_vocab_size):
    """Build and compile an embedding + bidirectional-GRU translation model.

    The layer stack is the same as ``bidirectional_embed_model`` defined later
    in this notebook.

    Args:
        input_shape: Shape of the input array; only ``input_shape[1]`` (the
            sequence length) is used, as the embedding's ``input_length``.
        english_vocab_size: Input dimension of the embedding layer.
        french_vocab_size: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (sparse categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    # Hyperparameters
    learning_rate = 0.005
    sequence_length = input_shape[1]

    # Layer stack, listed in forward order
    model = Sequential([
        Embedding(english_vocab_size, 256, input_length=sequence_length),
        Bidirectional(GRU(256, return_sequences=True)),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ])

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

# Create the model
simple_rnn_model = simple_model(tmp_x.shape, english_vocab_size, french_vocab_size)

# Train the model, holding out the last 20% of the samples for validation
simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Print the model summary.
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
simple_rnn_model.summary()





{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]
Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]
tmp_x shape: (137861, 21, 1)
preproc_french_sentences shape: (137861, 21)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoc

In [2]:
def logits_to_text(logits, tokenizer):
    """Turn a per-time-step score matrix into a space-separated sentence.

    Args:
        logits: 2-D array; the argmax along axis 1 selects one token id per row.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The words for the selected ids joined by single spaces; id 0 is
        rendered as ``'<PAD>'``.
    """
    # Invert word -> id into id -> word, then register the padding id.
    vocabulary = {token_id: word for word, token_id in tokenizer.word_index.items()}
    vocabulary[0] = '<PAD>'

    best_ids = np.argmax(logits, 1)
    return ' '.join(vocabulary[token_id] for token_id in best_ids)

In [3]:
# Print prediction(s) for the first training sample, followed by the
# reference translation and the source sentence
print("Prediciton:")
first_sample_scores = simple_rnn_model.predict(tmp_x[:1])[0]
print(logits_to_text(first_sample_scores, french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
new jersey est parfois calme pendant l' automne et il est neigeux avril avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [4]:
# Define the bidirectional model
def bd_model(input_shape, english_vocab_size, french_vocab_size):
    """Build and compile a bidirectional-GRU model without an embedding layer.

    Args:
        input_shape: Shape of the input array; ``input_shape[1:]`` is passed
            as the first layer's ``input_shape``.
        english_vocab_size: Accepted for signature parity with the other
            model builders; not used by this function.
        french_vocab_size: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (sparse categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    # Hyperparameters
    learning_rate = 0.005
    per_sample_shape = input_shape[1:]

    # Layer stack, listed in forward order
    model = Sequential([
        Bidirectional(GRU(128, return_sequences=True), input_shape=per_sample_shape),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ])

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

# Create the model
bd_rnn_model = bd_model(tmp_x.shape, english_vocab_size, french_vocab_size)

# Train the model, holding out the last 20% of the samples for validation
bd_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Print the model summary.
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
bd_rnn_model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_1 (Bidirectio  (None, 21, 256)          100608    
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 21, 1024)         263168    
 tributed)                                                       
                                                                 
 dropout_1 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_3 (TimeDis  (None, 21, 345)          353625    
 tributed)                                                       
                                                                 
Total par

In [7]:
# Define the bidirectional model with embedding
def bidirectional_embed_model(input_shape, english_vocab_size, french_vocab_size):
    """Build and compile an embedding + bidirectional-GRU translation model.

    Args:
        input_shape: Shape of the input array; only ``input_shape[1]`` (the
            sequence length) is used, as the embedding's ``input_length``.
        english_vocab_size: Input dimension of the embedding layer.
        french_vocab_size: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (sparse categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    # Hyperparameters
    learning_rate = 0.005
    sequence_length = input_shape[1]

    # Layer stack, listed in forward order
    model = Sequential([
        Embedding(english_vocab_size, 256, input_length=sequence_length),
        Bidirectional(GRU(256, return_sequences=True)),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ])

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

# Build the model
embed_rnn_model = bidirectional_embed_model(
    tmp_x.shape,
    english_vocab_size,
    french_vocab_size)

# Print the model summary.
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
embed_rnn_model.summary()

# Train the model; left as the cell's last expression so the returned
# History object is displayed.
embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 21, 256)           51200     
                                                                 
 bidirectional_2 (Bidirectio  (None, 21, 512)          789504    
 nal)                                                            
                                                                 
 time_distributed_4 (TimeDis  (None, 21, 1024)         525312    
 tributed)                                                       
                                                                 
 dropout_5 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_5 (TimeDis  (None, 21, 345)          353625    
 tributed)                                                       
                                                      

<keras.callbacks.History at 0x240bab606d0>

In [9]:
import json
# Save the model and related data
embed_rnn_model.save('english_to_french_model')

# Serialize both tokenizers to JSON (English first, then French).
# to_json() already returns a JSON string; it is deliberately passed through
# the JSON encoder again, so each file holds one JSON-encoded string.
for _tokenizer_path, _tokenizer in (
    ('english_tokenizer.json', english_tokenizer),
    ('french_tokenizer.json', french_tokenizer),
):
    with open(_tokenizer_path, 'w', encoding='utf8') as f:
        json.dump(_tokenizer.to_json(), f, ensure_ascii=False)

# Save max lengths
max_french_sequence_length_json = max_french_sequence_length
with open('sequence_length.json', 'w', encoding='utf8') as f:
    json.dump(max_french_sequence_length_json, f, ensure_ascii=False)



INFO:tensorflow:Assets written to: english_to_french_model\assets


INFO:tensorflow:Assets written to: english_to_french_model\assets


In [None]:
#BEAM SEARCH CODING part

In [5]:
import tensorflow as tf
import numpy as np
import json
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Load the transformer model
transformer = tf.keras.models.load_model('transformer_model')

# Load the tokenization configuration and vocabulary for English and Spanish
eng_vectorization_config = _read_json('eng_vectorization_config.json')
eng_vocab = _read_json('eng_vocab.json')

spa_vectorization_config = _read_json('spa_vectorization_config.json')
spa_vocab = _read_json('spa_vocab.json')

# Initialize tokenizers: each word's id is its position in the loaded vocabulary
eng_tokenizer = Tokenizer()
eng_tokenizer.word_index = {token: position for position, token in enumerate(eng_vocab)}

spa_tokenizer = Tokenizer()
spa_tokenizer.word_index = {token: position for position, token in enumerate(spa_vocab)}

# Define the function for vectorizing English input sentences
def eng_vectorization(sentences):
    """Encode English sentences as a post-padded array of token ids.

    Args:
        sentences: List of sentences (strings) to encode with ``eng_tokenizer``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'`` and ``maxlen``
        taken from ``eng_vectorization_config['max_len']``.
    """
    token_ids = eng_tokenizer.texts_to_sequences(sentences)
    target_len = eng_vectorization_config['max_len']
    return pad_sequences(token_ids, maxlen=target_len, padding='post')

# Define BeamSearchDecoder class
class BeamSearchDecoder:
    """Beam-search decoder driving a model through its ``predict`` method.

    The model is called as ``model.predict([input_seq, decoder_input],
    verbose=0)`` and its result is indexed as ``[0, step, token_id]``.
    The scores are passed through ``np.log``, so they are presumably
    probabilities (e.g. a softmax output) - confirm against the model.

    Args:
        model: Object exposing the ``predict`` call described above.
        tokenizer: Object whose ``word_index`` maps words to ids and contains
            the ``'[start]'`` and ``'[end]'`` markers.
        beam_width: Number of hypotheses kept after every step.
        max_seq_len: Decoder input length and maximum number of decoding steps.

    Raises:
        KeyError: If ``'[start]'`` or ``'[end]'`` is missing from ``word_index``.
    """

    def __init__(self, model, tokenizer, beam_width, max_seq_len):
        self.model = model
        self.tokenizer = tokenizer
        self.beam_width = beam_width
        self.max_seq_len = max_seq_len
        self.start_token = tokenizer.word_index['[start]']
        self.end_token = tokenizer.word_index['[end]']
        self.vocab_size = len(tokenizer.word_index)

    def _pad_output(self, output_seq):
        """Return ``output_seq`` as a (1, max_seq_len) int32 array, zero-padded at the end."""
        # Keep at most the last max_seq_len ids, matching what
        # pad_sequences(..., padding='post') does for over-long inputs.
        kept = output_seq[-self.max_seq_len:]
        padded = np.zeros((1, self.max_seq_len), dtype='int32')
        padded[0, :len(kept)] = kept
        return padded

    def decode(self, input_seq):
        """Run beam search for one encoded input.

        Args:
            input_seq: Encoder input, forwarded unchanged to ``model.predict``.

        Returns:
            The best-scoring hypothesis as a list of token ids, beginning with
            the start token. It ends with the end token unless the step limit
            was reached first.
        """
        # Each hypothesis is (encoder input, token ids so far, summed log-score).
        beam = [(input_seq, [self.start_token], 0.0)]

        for _ in range(self.max_seq_len):
            candidates = []
            for source, output_seq, score in beam:
                # A hypothesis that already produced the end token is complete:
                # carry it forward unchanged instead of appending tokens after
                # '[end]' and lowering its score further.
                if output_seq[-1] == self.end_token:
                    candidates.append((source, output_seq, score))
                    continue

                predictions = self.model.predict(
                    [source, self._pad_output(output_seq)], verbose=0)

                # Scores for the next token are read at the position of the
                # last token generated so far.
                step_scores = predictions[0, len(output_seq) - 1, :]
                top_k_indices = np.argsort(step_scores)[-self.beam_width:]
                for index in top_k_indices:
                    candidates.append((
                        source,
                        output_seq + [int(index)],  # plain ints, not numpy scalars
                        score + np.log(step_scores[index]),
                    ))

            # Keep the best beam_width sequences
            beam = sorted(candidates, key=lambda item: item[2], reverse=True)[:self.beam_width]

            # Stop as soon as every surviving hypothesis is complete
            if all(seq[-1] == self.end_token for _, seq, _ in beam):
                break

        # Return the best sequence
        return beam[0][1]

def sequence_to_text(sequence, tokenizer):
    """Convert token ids to a space-separated string, dropping special ids.

    Args:
        sequence: Iterable of token ids.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The words for every id joined by single spaces. Id 0 and the ids of
        ``'[start]'`` / ``'[end]'`` are skipped; an id missing from the
        vocabulary contributes an empty string.
    """
    word_index = tokenizer.word_index
    index_to_word = {index: word for word, index in word_index.items()}
    # -1 stands in for a marker that is absent from the vocabulary.
    skipped = (0, word_index.get('[start]', -1), word_index.get('[end]', -1))

    words = []
    for index in sequence:
        if index in skipped:
            continue
        words.append(index_to_word.get(index, ''))
    return ' '.join(words)

# Translate function using beam search
def translate_sentence(sentence, beam_width=3, max_seq_len=20):
    """Translate one English sentence with beam search over ``transformer``.

    Args:
        sentence: The English sentence to translate.
        beam_width: Number of hypotheses the decoder keeps per step.
        max_seq_len: Maximum decoding length handed to the decoder.

    Returns:
        The decoded text produced by ``sequence_to_text`` with ``spa_tokenizer``.
    """
    encoder_input = eng_vectorization([sentence])
    decoder = BeamSearchDecoder(transformer, spa_tokenizer, beam_width, max_seq_len)
    best_token_ids = decoder.decode(encoder_input)
    return sequence_to_text(best_token_ids, spa_tokenizer)

