In [1]:
import numpy as np
import re
import heapq
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [2]:
with open('training_update.txt', 'r', encoding='utf-8') as file:
    training_data = file.read()



def text_cleaner(text):
    newString = text.lower()
    newString = re.sub("[^a-zA-Zñáéíóúü ]", " ", newString)
    newString = re.sub('\s+', ' ', newString)
    long_words = [word for word in newString.split() if len(word) >= 1]
    return " ".join(long_words).strip()

def split_into_segments(input_data):
    segments = input_data.split('</entry>')
    cleaned_segments = [re.sub(r'<.*?>', '', segment).strip() for segment in segments if segment.strip()]
    return cleaned_segments


segments = split_into_segments(training_data)
cleaned_segments = [text_cleaner(segment) for segment in segments]

print(cleaned_segments)

['número pablo ayende maria josefa gomés en la ciudad de la havana en diez y ocho de julio de mil ochocientos y doce años haviendose leydo las tres canoni cas amo staciones en tres dias festivos sin resultar im pedimen yo licenciado don andres cascales beneficiado por s majestad de la iglesia auxiliar del santo angel custodio de esta ciudad de la havana case y velé ritualmente á pablo ayende ilegítimo de esta ciudad hijo legítimo de josef antonio y de maria de la soledad olibos viudo de maria del carmen fernandez y a maria josefa catarina gomes de la misma naturalidad hija legítima de juan bautista y de felipa carvajal todos pardos libres y dhos contrayentes confesaron y comulgaron fueron examinados en la doctrina cristiana siendo padrinos eugenio gomes y maria del rosa y testigos don francisco cortinas y don gabriel garcia sacrista nes menores y lo firme licenciado andres cascales', 'n josef rafael kongo juana leandra henrique en la ciudad de la havana en tres de agosto de milochocien

In [3]:
# 2. Tokenizing the text into words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_segments])
encoded = tokenizer.texts_to_sequences([cleaned_segments])[0]

In [4]:
# 3. Creating Sequences of Tokens
sequence_length = 5  # Length of the word sequences
sequences = [encoded[i - sequence_length:i+1] for i in range(sequence_length, len(encoded))]

In [5]:
# 4. Preparing the dataset
vocab_size = len(tokenizer.word_index) + 1
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)

In [6]:
# 5. Defining the Model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=sequence_length, trainable=True))
model.add(GRU(150, return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

In [7]:
# 6. Training
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5),
    ModelCheckpoint('model.h5', save_best_only=True, monitor='val_loss'),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)
]

model.fit(X_tr, y_tr, epochs=50, batch_size=64, validation_data=(X_val, y_val), verbose=1, callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


<keras.callbacks.History at 0x205f3c17160>

In [8]:
#Word Level Function
def generate_seq_beam_search(model, tokenizer, seq_length, seed_text, num_words, beam_width=3):
    sequences = [{'seq': seed_text, 'score': 1.0}]
    for _ in range(num_words):
        all_candidates = []
        for i in range(len(sequences)):
            seq, score = sequences[i]['seq'], sequences[i]['score']
            if len(seq) < seq_length:
                sequence = pad_sequences([seq], maxlen=seq_length, truncating='pre')
            else:
                sequence = np.array(seq[-seq_length:]).reshape(1, seq_length)
            
            preds = model.predict(sequence, verbose=0).flatten()
            top_indices = np.argsort(preds)[-beam_width:]

            for j in top_indices:
                candidate = [num for num in seq] + [j]
                candidate_score = score * preds[j]
                all_candidates.append({'seq': candidate, 'score': candidate_score})

        ordered = sorted(all_candidates, key=lambda tup: tup['score'], reverse=True)
        sequences = ordered[:beam_width]
    return sequences

In [9]:
def generate_text_for_gap(model, tokenizer, sequence_length, text_before_gap, text_after_gap, gap_length_estimate=3, beam_width=3):
    #lower case
    text_before_gap = text_before_gap.lower()
    text_after_gap = text_after_gap.lower()

    seed_text = text_before_gap
    sequence_seed = tokenizer.texts_to_sequences([seed_text])[0]
    seed_text_length = len(sequence_seed)

    results = generate_seq_beam_search(model, tokenizer, sequence_length, sequence_seed, gap_length_estimate, beam_width)
    best_sequence = results[0]['seq']

    predicted_gap_content = ' '.join(tokenizer.index_word.get(idx, '') for idx in best_sequence[seed_text_length:])

    completed_sentence = text_before_gap + ' ' + predicted_gap_content + ' ' + text_after_gap
    return completed_sentence

text_before_gap = input("Enter the texts before the gap: ")
text_after_gap = input("Enter the texts after the gap: ")
generated_text = generate_text_for_gap(model, tokenizer, sequence_length, text_before_gap, text_after_gap)
print(generated_text)

Enter the texts before the gap: Jueves veinte de
Enter the texts after the gap: mil setecientos
jueves veinte de sebastiana aos vinte e tres de abril de mil e seiscentos e oitenta e sete baptizei e pus os santos oelos a sebastiana filha de miguel e de sua mulher ignacia nunes for o padrinhos domingos correa e da matheus de numo sábado dies de mayo de mil ochocientos setenta y pablo macario her nandes tres yo presbítero dn santiago serra cura párroco vicario forá neo interino de esta yglesia de término de san cárlos de ma tánzas bauticé solemnemente y puse los sántos óleos á un párvulo que nació el dia quince de enero último hijo de padre no conocido y de la morena libre simona hernan des natural y vecina de esta feligresía nieto materno de tomas y de julia hernandes en dicho párvulo ejercí las sagrádas préces y ceremónias y le puse por nombre pa blo macario fueron sus padrinos blas martines y ba silia delfi á quienes advertí el parentesco espiritual y o bligaciones que contrajeron y lo

In [13]:
from nltk.util import ngrams
from collections import Counter

In [14]:
# Step 1: Generate text using your existing model
text_before_gap = input("Enter the texts before the gap: ")
text_after_gap = input("Enter the texts after the gap: ")
generated_text = generate_text_for_gap(model, tokenizer, sequence_length, text_before_gap, text_after_gap)

# Step 2: Apply post-processing to remove repetitive content
def remove_near_duplicate_content(text, n=3, threshold=0.5):
    sentences = nltk.sent_tokenize(text)
    unique_sentences = []
    seen_ngrams = Counter()

    for sentence in sentences:
        sentence_ngrams = list(ngrams(sentence.split(), n=n))
        common_ngrams = sum(seen_ngrams[ng] for ng in sentence_ngrams)
        if common_ngrams / max(len(sentence_ngrams), 1) < threshold:
            unique_sentences.append(sentence)
            seen_ngrams.update(sentence_ngrams)

    return ' '.join(unique_sentences)

cleaned_text = remove_near_duplicate_content(generated_text)

# Step 3: Output the final, cleaned text
print(cleaned_text)

Enter the texts before the gap: Jueves veinte de
Enter the texts after the gap: mil setecientos
jueves veinte de sebastiana aos vinte e tres de abril de mil e seiscentos e oitenta e sete baptizei e pus os santos oelos a sebastiana filha de miguel e de sua mulher ignacia nunes for o padrinhos domingos correa e da matheus de numo sábado dies de mayo de mil ochocientos setenta y pablo macario her nandes tres yo presbítero dn santiago serra cura párroco vicario forá neo interino de esta yglesia de término de san cárlos de ma tánzas bauticé solemnemente y puse los sántos óleos á un párvulo que nació el dia quince de enero último hijo de padre no conocido y de la morena libre simona hernan des natural y vecina de esta feligresía nieto materno de tomas y de julia hernandes en dicho párvulo ejercí las sagrádas préces y ceremónias y le puse por nombre pa blo macario fueron sus padrinos blas martines y ba silia delfi á quienes advertí el parentesco espiritual y o bligaciones que contrajeron y lo