# Text Generator: Next Word Prediction

## Import libraries

In [89]:
import pickle
import random

import numpy as np
import pandas as pd
from keras.layers import LSTM, Activation, Dense, Input
from keras.models import Sequential, load_model
from keras.optimizers import RMSprop
from nltk.tokenize import RegexpTokenizer


## Load and preprocess text

In [90]:
# Read Fake.csv into a DataFrame; its `text` column is used below as the corpus.
text_df = pd.read_csv(filepath_or_buffer="Fake.csv")

In [91]:
# Collect every entry of the `text` column and merge them into one corpus string.
# Missing values are dropped and the rest cast to str first: pandas represents
# empty cells as float NaN, and str.join raises TypeError on non-string items.
text = text_df["text"].dropna().astype(str).tolist()
joined_text = " ".join(text)

## Tokenizer

In [92]:
# Limit how much of the corpus the model is trained on.
# Note: the slice counts *characters* of the joined text, not words.
MAX_CHARS = 100_000
partial_text = joined_text[:MAX_CHARS]

In [93]:
# Tokenization: lower-case the corpus, then keep only runs of word
# characters (\w+), so punctuation and whitespace are discarded.
tokenizer = RegexpTokenizer(r"\w+")
lowercased_text = partial_text.lower()
tokens = tokenizer.tokenize(lowercased_text)

In [94]:
# Vocabulary: the sorted unique tokens, plus a lookup from each token to its
# position in that array (used as the one-hot index).
unique_tokens = np.unique(tokens)
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

{'000': 0,
 '059': 1,
 '1': 2,
 '10': 3,
 '100': 4,
 '11': 5,
 '12': 6,
 '13': 7,
 '140': 8,
 '15': 9,
 '16': 10,
 '1600': 11,
 '17': 12,
 '18': 13,
 '19': 14,
 '1967': 15,
 '1987': 16,
 '1995': 17,
 '2': 18,
 '20': 19,
 '2000': 20,
 '2012': 21,
 '2015': 22,
 '2016': 23,
 '2016image': 24,
 '2016this': 25,
 '2016twitter': 26,
 '2017': 27,
 '2017after': 28,
 '2017all': 29,
 '2017are': 30,
 '2017austin': 31,
 '2017bloomberg': 32,
 '2017breaking': 33,
 '2017calm': 34,
 '2017can': 35,
 '2017carrollton': 36,
 '2017comment': 37,
 '2017could': 38,
 '2017country': 39,
 '2017did': 40,
 '2017director': 41,
 '2017disney': 42,
 '2017do': 43,
 '2017donald': 44,
 '2017featured': 45,
 '2017for': 46,
 '2017former': 47,
 '2017franken': 48,
 '2017garland': 49,
 '2017good': 50,
 '2017he': 51,
 '2017here': 52,
 '2017his': 53,
 '2017i': 54,
 '2017in': 55,
 '2017is': 56,
 '2017it': 57,
 '2017make': 58,
 '2017no': 59,
 '2017now': 60,
 '2017npr': 61,
 '2017outgoing': 62,
 '2017pic': 63,
 '2017pirro': 64,
 '201

In [95]:
# Length (in words) of every input sequence
n_words = 10

# Slide a window over the token stream: each run of n_words consecutive tokens
# is one training input, and the token immediately after it is its target.
n_samples = len(tokens) - n_words
input_words = [tokens[start:start + n_words] for start in range(n_samples)]
next_words = [tokens[start + n_words] for start in range(n_samples)]


In [96]:
# Allocate the one-hot training tensors, all False to begin with.
vocab_size = len(unique_tokens)
# X is indexed as (sample, position in the input sequence, vocabulary index)
X = np.zeros(shape=(len(input_words), n_words, vocab_size), dtype=bool)
# y is indexed as (sample, vocabulary index of the word that follows)
y = np.zeros(shape=(len(next_words), vocab_size), dtype=bool)


In [97]:
# Fill in the one-hot encodings, walking inputs and targets in lockstep.
for sample_idx, (context, target) in enumerate(zip(input_words, next_words)):
    # Mark, for every position of the input window, which vocabulary entry occurs there
    for position, token in enumerate(context):
        X[sample_idx, position, unique_token_index[token]] = True

    # Mark the vocabulary entry of the word that follows the window
    y[sample_idx, unique_token_index[target]] = True


## Model implementation and training

In [98]:
# Create a Sequential model
model = Sequential()

# Declare the input shape with an explicit Input layer: sequences of n_words
# one-hot vectors, each of vocabulary size. Passing `input_shape=` to the first
# layer instead is deprecated in Keras 3 and emits a UserWarning.
model.add(Input(shape=(n_words, len(unique_tokens))))

# First LSTM layer with 128 units. return_sequences=True makes it emit one
# output per timestep, which the second LSTM layer needs as its input.
model.add(LSTM(128, return_sequences=True))

# Second LSTM layer with 128 units; it returns only its final output vector
model.add(LSTM(128))

# Dense layer with one unit per vocabulary entry (one score per candidate word)
model.add(Dense(len(unique_tokens)))

# Softmax turns those scores into a probability distribution over the vocabulary
model.add(Activation("softmax"))

  super().__init__(**kwargs)


In [99]:
# Compile the model with categorical cross-entropy loss, RMSprop optimizer
# (learning rate 0.01) and accuracy as the reported metric
model.compile(
    loss="categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.01),
    metrics=["accuracy"],
)

# Train on the one-hot inputs (X) and targets (y), reshuffling every epoch.
# The returned History object is kept in `history` so the per-epoch loss and
# accuracy stay available afterwards instead of being echoed and discarded.
history = model.fit(X, y, batch_size=128, epochs=30, shuffle=True)

Epoch 1/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 166ms/step - accuracy: 0.0392 - loss: 7.1099
Epoch 2/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 168ms/step - accuracy: 0.0465 - loss: 6.7040
Epoch 3/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 172ms/step - accuracy: 0.0525 - loss: 6.5405
Epoch 4/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 177ms/step - accuracy: 0.0662 - loss: 6.3046
Epoch 5/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 202ms/step - accuracy: 0.0858 - loss: 6.0031
Epoch 6/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 193ms/step - accuracy: 0.1116 - loss: 5.6600
Epoch 7/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 201ms/step - accuracy: 0.1369 - loss: 5.3694
Epoch 8/30
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 216ms/step - accuracy: 0.1604 - loss: 5.0639
Epoch 9/30
[1m1

<keras.src.callbacks.history.History at 0x25ee1ab2790>

In [100]:
# Persist the trained model to disk
model.save(filepath="text_generator.h5")



In [101]:
# Restore the saved model (lets the cells below run without retraining)
model = load_model(filepath="text_generator.h5")



## Prediction for next word

In [102]:
def predict_next_word(input_text, n_best):
    """Return vocabulary indices of the most likely next words for ``input_text``.

    The text is lower-cased and split with the notebook-level ``tokenizer``
    (the one the vocabulary was built with), so punctuation is handled the same
    way as in training. Words that are not in ``unique_token_index`` are
    dropped, and only the last ``n_words`` remaining words are encoded, because
    the model input has exactly ``n_words`` timesteps.

    Args:
        input_text: Prompt text.
        n_best: Number of candidate indices wanted; clamped to the range
            ``0 .. len(unique_tokens)``.

    Returns:
        1-D integer array of indices into ``unique_tokens``, ordered from most
        to least probable. Empty when ``n_best <= 0``.
    """
    vocab_size = len(unique_tokens)

    # Clamp the request: argpartition raises if more entries are asked for than
    # exist, and a slice of [-0:] would silently return the entire vocabulary.
    n_best = max(0, min(int(n_best), vocab_size))
    if n_best == 0:
        return np.array([], dtype=int)

    # Keep in-vocabulary words only, then the most recent n_words of them
    known_words = [
        word
        for word in tokenizer.tokenize(input_text.lower())
        if word in unique_token_index
    ]
    context = known_words[-n_words:]

    # One-hot encode the context; unused trailing timesteps stay all-zero
    X = np.zeros((1, n_words, vocab_size))
    for i, word in enumerate(context):
        X[0, i, unique_token_index[word]] = 1

    # Probability of each vocabulary entry being the next word
    predictions = np.asarray(model.predict(X)[0])

    # argpartition selects the n_best largest entries in arbitrary order;
    # sort just those so the most probable candidate comes first.
    top_indices = np.argpartition(predictions, -n_best)[-n_best:]
    return top_indices[np.argsort(predictions[top_indices])[::-1]]

In [103]:
# Ask the model for the five most likely continuations of a sample prompt
possible = predict_next_word(input_text="He said that", n_best=5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 614ms/step


In [104]:
# Translate the predicted vocabulary indices back into words
predicted_words = [unique_tokens[token_id] for token_id in possible]
print(predicted_words)

['president', 'all', 'well', 'a', 'had']


In [105]:
def generate_text(input_text, text_length, creativity=3):
    """Extend ``input_text`` by ``text_length`` generated words.

    At every step the text produced so far is tokenized, its last ``n_words``
    tokens are passed to ``predict_next_word``, and one of the ``creativity``
    candidates returned is picked at random and appended.

    Args:
        input_text: Seed text; its words are kept verbatim at the start of the
            result.
        text_length: Number of words to append.
        creativity: Number of top candidates to sample from at each step
            (1 always takes the single best prediction).

    Returns:
        The seed words followed by the generated words, joined by spaces.
    """
    word_sequence = input_text.split()

    for _ in range(text_length):
        # Tokenize first and only then take the window, so that the slice
        # selects the most recent n_words *words*. Slicing the joined string
        # instead would select characters and cut the context to a fragment.
        context_tokens = tokenizer.tokenize(" ".join(word_sequence).lower())
        sub_sequence = " ".join(context_tokens[-n_words:])

        try:
            # list() keeps random.choice on a plain sequence
            candidates = list(predict_next_word(sub_sequence, creativity))
            choice = unique_tokens[random.choice(candidates)]
        except (KeyError, IndexError, ValueError):
            # Prediction failed (e.g. unknown word, no candidates, or an
            # out-of-range candidate count): fall back to a random vocabulary
            # word. Anything else, including KeyboardInterrupt, propagates.
            choice = unique_tokens[random.randrange(len(unique_tokens))]

        word_sequence.append(str(choice))

    return " ".join(word_sequence)


In [112]:
# Append a single word to the prompt, always taking the top prediction
generate_text("He said", text_length=1, creativity=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


'He said me'