# Next Word Prediction Model
# This script preprocesses text data, trains an LSTM model for next word prediction, and demonstrates the model's capability to generate text. It includes data augmentation, memory-efficient processing, and uses custom metrics for evaluation.

Note: This script is designed to run on Google Colab with GPU acceleration.

## Import Libraries


In [1]:
import os
import string
import re
from typing import List, Tuple
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tqdm.auto import tqdm
from nltk.corpus import wordnet
import nltk
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Download required NLTK data
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [3]:
# Ensure TensorFlow is using GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.config.list_physical_devices('GPU')

Num GPUs Available:  1


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Define Helper Functions

In [4]:
def load_doc(filename: str) -> str:
    """
    Load document from file.

    Args:
        filename (str): Path to the file.

    Returns:
        str: Content of the file.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

def clean_doc(doc: str) -> List[str]:
    """
    Clean the document by removing punctuation and converting to lowercase.

    Args:
        doc (str): Input document.

    Returns:
        List[str]: List of cleaned tokens.
    """
    doc = doc.replace('--', ' ')
    tokens = doc.split()
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

def tokenize_twitter(sentences: List[str]) -> List[str]:
    """
    Tokenize and clean sentences with Twitter-specific processing.

    Args:
        sentences (List[str]): List of input sentences.

    Returns:
        List[str]: List of cleaned sentences.
    """
    print("Starting Cleaning Process")

    def cleanhtml(raw_html: str) -> str:
        cleanr = re.compile('<.*?>')
        return re.sub(cleanr, '', raw_html)

    def _replace_urls(data: str) -> str:
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', data)

    def remove_email(data: str) -> str:
        return re.sub('\S*@\S*\s?', '', data)

    def misc(data: str) -> str:
        data = re.sub('\s+', ' ', data)
        data = re.sub("\'", "", data)
        data = re.sub("ww+", "", data)
        MAYBE_ROMAN = re.compile(r'(\b[MDCLXVI]+\b)(\.)?', re.I)
        data = re.sub(MAYBE_ROMAN, "", data)
        return data

    tokenized_sentences = []
    for sentence in tqdm(sentences, desc="Cleaning sentences", ncols=100):
        sentence = cleanhtml(sentence)
        sentence = _replace_urls(sentence)
        sentence = remove_email(sentence)
        sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
        sentence = sentence.lower()
        sentence = misc(sentence)
        tokenized_sentences.append(sentence)

    return tokenized_sentences

def littleCleaning(sentences: List[str]) -> List[str]:
    """
    Remove sentences with less than 5 words.

    Args:
        sentences (List[str]): List of input sentences.

    Returns:
        List[str]: Filtered list of sentences.
    """
    print("Starting cleaning Process")
    return [sentence for sentence in tqdm(sentences, desc="Filtering sentences", ncols=100) if len(sentence.split()) > 5]

## Data Preparation Functions

In [5]:
def normalization_pipeline(sentences: List[str]) -> List[str]:
    """
    Apply normalization pipeline to the sentences.

    Args:
        sentences (List[str]): List of input sentences.

    Returns:
        List[str]: Normalized sentences.
    """
    print("Starting Normalization Process")
    sentences = tokenize_twitter(sentences)
    sentences = littleCleaning(sentences)
    print("Normalization Process Finished")
    return sentences

def get_synonyms(word: str) -> List[str]:
    """
    Get synonyms for a given word.

    Args:
        word (str): Input word.

    Returns:
        List[str]: List of synonyms.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:
                synonyms.add(synonym)
    return list(synonyms)

def synonym_replacement(sentence: str, n: int) -> str:
    """
    Replace words in a sentence with their synonyms.

    Args:
        sentence (str): Input sentence.
        n (int): Number of words to replace.

    Returns:
        str: Sentence with replaced words.
    """
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word.isalpha()]))
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = np.random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

def augment_data(sentences: List[str], augment_factor: int = 2) -> List[str]:
    """
    Augment data by replacing words with synonyms.

    Args:
        sentences (List[str]): List of input sentences.
        augment_factor (int): Number of augmented sentences to create for each input sentence.

    Returns:
        List[str]: Augmented list of sentences.
    """
    augmented_sentences = []
    for sentence in tqdm(sentences, desc="Augmenting data", ncols=100):
        augmented_sentences.append(sentence)
        for _ in range(augment_factor - 1):
            augmented_sentence = synonym_replacement(sentence, n=int(len(sentence.split()) * 0.1))
            augmented_sentences.append(augmented_sentence)
    return augmented_sentences

def prepare_sequences(tokens: List[str], seq_length: int) -> Tuple[np.ndarray, np.ndarray, Tokenizer, int]:
    """
    Prepare sequences for model training.

    Args:
        tokens (List[str]): List of tokens.
        seq_length (int): Length of each sequence.

    Returns:
        Tuple[np.ndarray, np.ndarray, Tokenizer, int]: Input sequences, target words, fitted tokenizer, and vocabulary size.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([' '.join(tokens)])

    sequences = []
    for i in range(seq_length, len(tokens)):
        seq = tokens[i-seq_length:i]
        sequences.append(' '.join(seq))

    sequences = tokenizer.texts_to_sequences(sequences)

    vocab_size = len(tokenizer.word_index) + 1
    sequences = np.array(sequences)
    X, y = sequences[:, :-1], sequences[:, -1]
    y = keras.utils.to_categorical(y, num_classes=vocab_size)

    return X, y, tokenizer, vocab_size

## Model Creation and Training

In [6]:
def create_model(vocab_size: int, seq_length: int) -> Sequential:
    """
    Create and compile the LSTM model with regularization and dropout.

    Args:
        vocab_size (int): Size of the vocabulary.
        seq_length (int): Length of input sequences.

    Returns:
        Sequential: Compiled Keras model.
    """
    model = Sequential([
        Embedding(vocab_size, 100, input_length=seq_length),
        Bidirectional(LSTM(100, return_sequences=True, kernel_regularizer=l2(1e-5))),
        Dropout(0.2),
        Bidirectional(LSTM(100, kernel_regularizer=l2(1e-5))),
        Dropout(0.2),
        Dense(100, activation='relu', kernel_regularizer=l2(1e-5)),
        Dropout(0.2),
        Dense(vocab_size, activation='softmax')
    ])
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

def generate_seq(model: Sequential, tokenizer: Tokenizer, seq_length: int, seed_text: str, n_words: int) -> str:
    """
    Generate a sequence of words from the trained model.

    Args:
        model (Sequential): Trained Keras model.
        tokenizer (Tokenizer): Fitted tokenizer.
        seq_length (int): Length of input sequences.
        seed_text (str): Initial text to start generation.
        n_words (int): Number of words to generate.

    Returns:
        str: Generated text.
    """
    result = []
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        predict_x = model.predict(encoded, verbose=0)
        yhat = np.argmax(predict_x, axis=1)
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

## Main Execution

In [9]:
# Download the data
Data = !wget https://www.gutenberg.org/ebooks/1497.txt.utf-8

In [10]:
# Load and preprocess data
path = '/content/1497.txt.utf-8'
text = load_doc(path)
print('Length of the corpus:', len(text))

Length of the corpus: 1213712


In [11]:
data_list = text.split(".")
pro_sentences = normalization_pipeline(data_list)
print("Number of processed sentences:", len(pro_sentences))
print("Sample processed sentences:", pro_sentences[:5])

Starting Normalization Process
Starting Cleaning Process


Cleaning sentences:   0%|                                                  | 0/7313 [00:00<?, ?it/s]

Starting cleaning Process


Filtering sentences:   0%|                                                 | 0/7313 [00:00<?, ?it/s]

Normalization Process Finished
Number of processed sentences: 6390
Sample processed sentences: [' the project gutenberg ebook of the republic this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever', ' you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at ', ' if you are not located in the united states you will have to check the laws of the country where you are located before using this ebook', ' title the republic author plato translator benjamin jowett release date october ebook most recently updated september language english credits sue asscher and david widger start of the project gutenberg ebook the republic the republic by plato translated by benjamin jowett note see also the republic by plato jowett ebook contents introduction and analysis', ' the republic of plato is the longest of his works with the exce

In [12]:
# Data Augmentation
augmented_sentences = augment_data(pro_sentences)
print("Number of sentences after augmentation:", len(augmented_sentences))

Augmenting data:   0%|                                                     | 0/6390 [00:00<?, ?it/s]

Number of sentences after augmentation: 12780


In [13]:
# Use all processed sentences
dataText = " ".join(augmented_sentences)
print("Length of preprocessed text:", len(dataText))
print("Sample of preprocessed text:", dataText[:200])

Length of preprocessed text: 2393923
Sample of preprocessed text:  the project gutenberg ebook of the republic this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever t


In [14]:
# Clean document and prepare sequences
tokens = clean_doc(dataText)
print('Total Tokens:', len(tokens))
print('Unique Tokens:', len(set(tokens)))
print("Sample tokens:", tokens[:20])

Total Tokens: 438598
Unique Tokens: 12653
Sample tokens: ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states']


In [15]:
seq_length = 51  # 50 input words + 1 output word
X, y, tokenizer, vocab_size = prepare_sequences(tokens, seq_length)
print("Input shape:", X.shape)
print("Output shape:", y.shape)
print("Vocabulary size:", vocab_size)

Input shape: (438547, 50)
Output shape: (438547, 12654)
Vocabulary size: 12654


In [16]:
# Create and train the model
model = create_model(vocab_size, seq_length-1)

history = model.fit(
    X, y,
    epochs=100,
    batch_size=1024,
    verbose=1
)

Epoch 1/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 54ms/step - accuracy: 0.0730 - loss: 6.8627
Epoch 2/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 54ms/step - accuracy: 0.1167 - loss: 5.9787
Epoch 3/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 54ms/step - accuracy: 0.1337 - loss: 5.7928
Epoch 4/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 54ms/step - accuracy: 0.1476 - loss: 5.6638
Epoch 5/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 54ms/step - accuracy: 0.1563 - loss: 5.5415
Epoch 6/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 54ms/step - accuracy: 0.1597 - loss: 5.4588
Epoch 7/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 54ms/step - accuracy: 0.1628 - loss: 5.3890
Epoch 8/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 54ms/step - accuracy: 0.1665 - loss: 5.3140
Epoch 9/100
[1m

In [17]:
# Save the model and tokenizer
model.save("next_word_prediction_model.keras")
joblib.dump(tokenizer, 'tokenizer.joblib')

['tokenizer.joblib']

In [19]:
# Generate new text using the trained model
seed_text = " ".join(tokens[:50])
next_words = generate_seq(model, tokenizer, seq_length-1, seed_text, 6)
print("Seed text:", seed_text)
print("Next 6 words:", next_words)

Seed text: the project gutenberg ebook of the republic this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever the picture gutenberg ebook of the republic this ebook is for the use of
Next 6 words: anyone anywhere in the united states


In [20]:
# Demonstrate the prediction in context
full_text = seed_text + " " + next_words
print("\nFull text with prediction:")
print(full_text)


Full text with prediction:
the project gutenberg ebook of the republic this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever the picture gutenberg ebook of the republic this ebook is for the use of anyone anywhere in the united states
