In [1]:
import numpy as np
import re
import heapq
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [3]:
with open('training_update.txt', 'r', encoding='utf-8') as file:
    training_data = file.read()


nltk.download('stopwords')


def text_cleaner(text):
    newString = text.lower()
    newString = re.sub("[^a-zA-Zñáéíóúü ]", " ", newString)
    newString = re.sub('\s+', ' ', newString)
    stop_words = set(stopwords.words('spanish'))
    long_words = [word for word in newString.split() if len(word) >= 3 and word not in stop_words]
    return " ".join(long_words).strip()

def split_into_segments(input_data):
    segments = input_data.split('</entry>')
    cleaned_segments = [re.sub(r'<.*?>', '', segment).strip() for segment in segments if segment.strip()]
    return cleaned_segments


segments = split_into_segments(training_data)
cleaned_segments = [text_cleaner(segment) for segment in segments]



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Suzreal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# 2. Tokenizing the text into words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_segments])
encoded = tokenizer.texts_to_sequences([cleaned_segments])[0]

In [5]:
# 3. Creating Sequences of Tokens
sequence_length = 5  # Length of the word sequences
sequences = [encoded[i - sequence_length:i+1] for i in range(sequence_length, len(encoded))]

In [6]:
# 4. Preparing the dataset
vocab_size = len(tokenizer.word_index) + 1
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)

In [7]:
# 5. Defining the Model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=sequence_length, trainable=True))
model.add(GRU(150, return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

In [8]:
# 6. Training
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5),
    ModelCheckpoint('model.h5', save_best_only=True, monitor='val_loss'),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)
]

model.fit(X_tr, y_tr, epochs=50, batch_size=64, validation_data=(X_val, y_val), verbose=1, callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


<keras.callbacks.History at 0x210411be940>

In [9]:
#Word Level Function
def generate_seq_beam_search(model, tokenizer, seq_length, seed_text, num_words, beam_width=3):
    sequences = [{'seq': seed_text, 'score': 1.0}]
    for _ in range(num_words):
        all_candidates = []
        for i in range(len(sequences)):
            seq, score = sequences[i]['seq'], sequences[i]['score']
            if len(seq) < seq_length:
                sequence = pad_sequences([seq], maxlen=seq_length, truncating='pre')
            else:
                sequence = np.array(seq[-seq_length:]).reshape(1, seq_length)
            
            preds = model.predict(sequence, verbose=0).flatten()
            top_indices = np.argsort(preds)[-beam_width:]

            for j in top_indices:
                candidate = [num for num in seq] + [j]
                candidate_score = score * preds[j]
                all_candidates.append({'seq': candidate, 'score': candidate_score})

        ordered = sorted(all_candidates, key=lambda tup: tup['score'], reverse=True)
        sequences = ordered[:beam_width]
    return sequences

In [10]:
def generate_text_for_gap(model, tokenizer, sequence_length, text_before_gap, text_after_gap, gap_length_estimate=3, beam_width=3):
    #lower case
    text_before_gap = text_before_gap.lower()
    text_after_gap = text_after_gap.lower()

    seed_text = text_before_gap
    sequence_seed = tokenizer.texts_to_sequences([seed_text])[0]
    seed_text_length = len(sequence_seed)

    results = generate_seq_beam_search(model, tokenizer, sequence_length, sequence_seed, gap_length_estimate, beam_width)
    best_sequence = results[0]['seq']

    predicted_gap_content = ' '.join(tokenizer.index_word.get(idx, '') for idx in best_sequence[seed_text_length:])

    completed_sentence = text_before_gap + ' ' + predicted_gap_content + ' ' + text_after_gap
    return completed_sentence

text_before_gap = input("Enter the texts before the gap: ")
text_after_gap = input("Enter the texts after the gap: ")
generated_text = generate_text_for_gap(model, tokenizer, sequence_length, text_before_gap, text_after_gap)
print(generated_text)

Enter the texts before the gap: Jueves veinte de
Enter the texts after the gap: mil setecientos
jueves veinte de josef maria rangel congo maria josefa criolla ciudad havana veinte seis octubre mil ochocientos doce años haviendose leydo tres canonicas amonestaciones tres días festivos resultar impedimento licenciado don andres cascales beneficiado iglesia auxiliar santo angel custodio ciudad havana case vele ritual mente josef maria rangel congo viudo maria dolores esclavo don antonio rangel maria josefa luisa criolla ciudad hija legítima agustin rafaela esclava don manuel apesechea dhos contrayentes confesaron comulgaron examinados doctrina cristiana siendo testigos don josef rafael morales don juan escobar padrinos josef cruz yglesias maria lus dias firme licenciado andres cascales josef maria rangel congo maria josefa criolla ciudad havana veinte seis octubre mil ochocientos doce años haviendose leydo tres canonicas amonestaciones tres días festivos resultar impedimento licenciado do