In [1]:
import numpy as np
import re
import heapq
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [2]:
# 1. Text Cleaner
def text_cleaner(text):
    """Normalise raw text into a space-separated string of lowercase words.

    Steps, in order:
      1. lowercase the text;
      2. remove possessive ``'s`` suffixes;
      3. replace every character that is not an ASCII letter, one of
         ``ñáéíóúü`` or a space with a space;
      4. drop words shorter than three characters.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving words joined by single spaces (empty string when no
        word survives).
    """
    lowered = text.lower()
    # Raw strings keep the regex escapes (\b) from being parsed as string escapes.
    without_possessive = re.sub(r"'s\b", "", lowered)
    letters_only = re.sub(r"[^a-zA-Zñáéíóúü ]", " ", without_possessive)
    # str.split() already collapses whitespace runs and ignores leading/trailing
    # blanks, so no separate whitespace-normalisation pass is required.
    return " ".join(word for word in letters_only.split() if len(word) >= 3)

# Read the whole raw corpus into memory as UTF-8 text.
with open('training_data.txt', mode='r', encoding='utf-8') as corpus_file:
    training_data = corpus_file.read()

# Cleaned corpus: the output of text_cleaner applied to the raw text.
data_new = text_cleaner(training_data)

In [3]:
# 2. Tokenizing the text into words
# The cleaned corpus is handed to the tokenizer as a single document: the
# vocabulary is fitted on it first, then the same document is encoded and the
# one resulting index list is kept in `encoded`.
corpus_texts = [data_new]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_texts)
encoded = tokenizer.texts_to_sequences(corpus_texts)[0]

In [4]:
# 3. Creating Sequences of Tokens
sequence_length = 30  # Length of the word sequences

# Slide a window over the encoded corpus. Every window holds
# `sequence_length` consecutive tokens followed by the one token after them,
# i.e. sequence_length + 1 tokens in total.
window_size = sequence_length + 1
sequences = [
    encoded[start:start + window_size]
    for start in range(len(encoded) - sequence_length)
]

In [5]:
# 4. Preparing the dataset
SPLIT_SEED = 42  # fixed seed so the train/validation split is identical on every run

# Tokenizer word indices start at 1 (0 is reserved), hence the +1.
vocab_size = len(tokenizer.word_index) + 1
sequences = np.array(sequences)
if sequences.ndim != 2:
    # An empty window list yields a 1-D array and the 2-D slicing below would
    # fail with an unhelpful IndexError.
    raise ValueError(
        "No training windows were produced; the corpus must contain more than "
        "sequence_length tokens."
    )

# All but the last token of each window are the inputs; the last is the target.
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)
# NOTE(review): the windows overlap, so a shuffled split places near-duplicate
# contexts in both sets — validation metrics may look optimistic.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=SPLIT_SEED)

In [6]:
# 5. Defining the Model
# Layer stack: embedding (50 dims) -> GRU (150 units, final state only) ->
# softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 50, input_length=sequence_length, trainable=True),
    GRU(150, return_sequences=False),
    Dense(vocab_size, activation='softmax'),
])
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [7]:
# 6. Training
callbacks = [
    # Stop after 5 epochs without val_loss improvement. restore_best_weights
    # puts the best-epoch weights back into `model` when training is stopped,
    # so the in-memory model used by later cells is not the one from
    # `patience` epochs past the best.
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    # Keep an on-disk copy of the best model seen so far.
    ModelCheckpoint('model.h5', save_best_only=True, monitor='val_loss'),
    # Multiply the learning rate by 0.2 after 3 stagnant epochs, never going below 0.001.
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)
]

model.fit(X_tr, y_tr, epochs=50, batch_size=64, validation_data=(X_val, y_val), verbose=1, callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50


<keras.callbacks.History at 0x181ec08bfd0>

In [8]:
# Word-level text generation with beam search
def generate_seq_beam_search(model, tokenizer, seq_length, seed_text, num_words, beam_width=3):
    """Extend a token sequence by ``num_words`` tokens using beam search.

    Parameters
    ----------
    model
        Object exposing ``predict(batch, verbose=0)``. It is called with an
        integer array of shape ``(1, seq_length)`` and its flattened output is
        read as one probability per token index.
    tokenizer
        Not used by this function; kept so existing call sites keep working.
    seq_length : int
        Number of most recent tokens fed to the model at every step.
    seed_text : sequence of int
        Despite the name, the already-encoded seed (token indices), not a string.
    num_words : int
        Number of tokens to append to the seed.
    beam_width : int, default 3
        Number of hypotheses kept after each step, and number of
        continuations tried per hypothesis.

    Returns
    -------
    list of dict
        Surviving hypotheses, best first. Each dict has ``'seq'`` (seed plus
        generated token indices, as a list of ints) and ``'score'`` (product
        of the probabilities of the generated tokens).
    """
    # Hypotheses are ranked by summed log-probabilities: a raw product of
    # probabilities underflows to 0.0 on long generations, after which every
    # hypothesis ties and the ranking becomes arbitrary.
    beams = [(0.0, list(seed_text))]
    for _ in range(num_words):
        candidates = []
        for log_score, seq in beams:
            if len(seq) < seq_length:
                # Short context: pad up to the model's input length.
                window = pad_sequences([seq], maxlen=seq_length, truncating='pre')
            else:
                # Long context: keep only the most recent seq_length tokens.
                window = np.array(seq[-seq_length:]).reshape(1, seq_length)

            preds = np.asarray(model.predict(window, verbose=0), dtype=np.float64).flatten()
            top_indices = np.argsort(preds)[-beam_width:]
            # A probability of exactly 0 maps to -inf, which simply ranks last.
            with np.errstate(divide='ignore'):
                top_log_probs = np.log(preds[top_indices])

            for token, log_prob in zip(top_indices, top_log_probs):
                # int(): store plain Python ints rather than numpy scalars.
                candidates.append((log_score + float(log_prob), seq + [int(token)]))

        # Same selection as sorting by score descending and slicing to beam_width.
        beams = heapq.nlargest(beam_width, candidates, key=lambda cand: cand[0])

    return [{'seq': seq, 'score': float(np.exp(log_score))} for log_score, seq in beams]

In [9]:
# Define the text before and after the gap
text_before_gap = "Jueves veinte de".lower()
text_after_gap = "mil setecientos".lower()

# Estimate the gap length (this can be a rough estimate or based on context)
gap_length_estimate = 3  # let's say we expect three words to fill the gap

# Prepare the seed text for the gap generation.
# The seed goes through the same text_cleaner as the training corpus so that
# both are preprocessed identically before tokenisation.
seed_text = text_before_gap
sequence_seed = tokenizer.texts_to_sequences([text_cleaner(seed_text)])[0]
seed_text_length = len(sequence_seed)

# Generate the sequence for the gap (only the text before the gap is used as context)
results = generate_seq_beam_search(model, tokenizer, sequence_length, sequence_seed, gap_length_estimate, beam_width=3)
best_sequence = results[0]['seq']

# Convert the generated indices (everything after the seed) to words; indices
# without a vocabulary entry are skipped so they do not leave empty tokens behind.
gap_words = [tokenizer.index_word.get(idx, '') for idx in best_sequence[seed_text_length:]]
predicted_gap_content = ' '.join(word for word in gap_words if word)

# Merge the content to complete the sentence
completed_sentence = text_before_gap + ' ' + predicted_gap_content + ' ' + text_after_gap
print(completed_sentence)

jueves veinte de tres enero mil mil setecientos
