In [1]:
import numpy as np
import re
import heapq
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

In [3]:
# 1. Text Cleaner
def text_cleaner(text):
    newString = text.lower()
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zñáéíóúü]", " ", newString)
    long_words=[]
    for i in newString.split():
        if len(i)>=3:
            long_words.append(i)
    return (" ".join(long_words)).strip()

In [4]:

with open('training_data.txt', 'r', encoding='utf-8') as file:
    training_data = file.read()

data_new = text_cleaner(training_data)

In [5]:
# 2. Creating Sequences
def create_seq(text, length = 30):
    sequences = list()
    for i in range(length, len(text)):
        seq = text[i-length:i+1]
        sequences.append(seq)
    return sequences

sequences = create_seq(data_new)

In [6]:
# 3. Character Mapping
chars = sorted(list(set(data_new)))
mapping = dict((c, i) for i, c in enumerate(chars))
reverse_mapping = {i: c for c, i in mapping.items()}  # Reverse mapping for efficiency

def encode_seq(seq):
    sequences = list()
    for line in seq:
        encoded_seq = [mapping[char] for char in line]
        sequences.append(encoded_seq)
    return sequences

sequences = encode_seq(sequences)

In [7]:
# 4. Preparing the dataset
vocab = len(mapping)
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# 5. Defining the Model
model = Sequential()
model.add(Embedding(vocab, 50, input_length=30, trainable=True))
model.add(GRU(150, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=Adam(learning_rate=0.01))

In [8]:
# 6. Training
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint('model.h5', save_best_only=True, save_weights_only=False, monitor='val_loss'),
             ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)]

history = model.fit(X_tr, y_tr, epochs=50, batch_size=256, verbose=1, callbacks=callbacks, validation_data=(X_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50


In [9]:
# 7. Function for Text Generation using Beam Search
def generate_seq_beam_search(model, mapping, seq_length, seed_text, n_chars, beam_width=3):
    sequences = [{'seq': seed_text, 'score': 0.0}]
    for _ in range(n_chars):
        all_candidates = list()
        for i in range(len(sequences)):
            seq, score = sequences[i]['seq'], sequences[i]['score']
            encoded = [mapping[char] for char in seq]
            encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre').squeeze()
            pred = model.predict(np.array([encoded]), verbose=0)
            probas = np.exp(pred) / np.sum(np.exp(pred))
            top_k = heapq.nlargest(beam_width, zip(probas[0], list(range(len(probas[0])))))
            for j in range(len(top_k)):
                score_, idx = top_k[j]
                out_char = reverse_mapping[idx]
                candidate = {'seq': seq + out_char, 'score': score - np.log(score_)}
                all_candidates.append(candidate)
        ordered = sorted(all_candidates, key=lambda tup:tup['score'], reverse=True)
        sequences = ordered[:beam_width]
    return sequences

In [13]:
#Function Design for predicting missing text after a sentence
seed_text = "Juebes veinte tres de febrero de mil setecientos".lower()
num_chars_to_predict = 30
results = generate_seq_beam_search(model, mapping, 30, seed_text, num_chars_to_predict)
print(results[0]['seq'])

juebes veinte tres de febrero de mil setecientosticatusesla veinienoco exle co


In [14]:
#Predicting missing text in a gap 
# Define the text before and after the gap
text_before_gap = "Juebes veinte".lower()
text_after_gap = "de mil setecientos".lower()

# Estimate the gap length (this can be a rough estimate or based on context)
# Here, I'm using an arbitrary value; adjust as needed
gap_length_estimate = 15

# Generate content for the gap
results = generate_seq_beam_search(model, mapping, 30, text_before_gap, gap_length_estimate)
predicted_gap_content = results[0]['seq'][len(text_before_gap):]

# Merge the content
completed_sentence = text_before_gap + predicted_gap_content + text_after_gap
print(completed_sentence)

juebes veintecla mosaceninatde mil setecientos
