<a href="https://colab.research.google.com/github/shashankt1/Text-generative-Model/blob/main/Text_Gen_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
import nltk
from nltk.corpus import gutenberg

In [71]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [72]:
import requests
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
response = requests.get(url)

In [73]:
with open("pride_and_prejudice.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

In [74]:
with open("pride_and_prejudice.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [75]:
import string
import numpy as np

In [76]:
def preprocess_text(text):

    text = text.lower()

    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = text.split()
    return tokens

tokens = preprocess_text(text)


In [77]:
def create_sequences(tokens, seq_length):
    sequences = []
    for i in range(len(tokens) - seq_length):
        sequences.append(tokens[i:i + seq_length + 1])
    return sequences

seq_length = 30
sequences = create_sequences(tokens, seq_length)

In [78]:
vocab = sorted(set(tokens))
vocab_size = len(vocab)

token_to_int = {token: i for i, token in enumerate(vocab)}
int_to_token = {i: token for i, token in enumerate(vocab)}
encoded_sequences = [[token_to_int[token] for token in seq] for seq in sequences]
X = np.array([seq[:-1] for seq in encoded_sequences])
y = np.array([seq[1:] for seq in encoded_sequences])

In [79]:
import tensorflow as tf


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [80]:
encoded_sequences = [[token_to_int[token] for token in seq] for seq in sequences]
X = np.array([seq[:-1] for seq in encoded_sequences])
y = np.array([seq[-1] for seq in encoded_sequences])

In [81]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [82]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 56ms/step - accuracy: 0.0426 - loss: 6.9158 - val_accuracy: 0.0696 - val_loss: 6.1749
Epoch 2/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 57ms/step - accuracy: 0.0837 - loss: 5.9427 - val_accuracy: 0.1005 - val_loss: 5.8630
Epoch 3/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 58ms/step - accuracy: 0.1172 - loss: 5.3840 - val_accuracy: 0.1108 - val_loss: 5.8392
Epoch 4/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 55ms/step - accuracy: 0.1432 - loss: 4.9743 - val_accuracy: 0.1150 - val_loss: 5.8904
Epoch 5/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 56ms/step - accuracy: 0.1704 - loss: 4.6189 - val_accuracy: 0.1119 - val_loss: 6.0437
Epoch 6/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 54ms/step - accuracy: 0.2042 - loss: 4.2728 - val_accuracy: 0.1069 - val_loss: 6.3356


<keras.src.callbacks.history.History at 0x784717d6ace0>

In [83]:
def prepare_seed(seed_text, token_to_int):
    seed_tokens = seed_text.lower().split()
    return [token_to_int[token] for token in seed_tokens if token in token_to_int]

In [84]:
def generate_text(model, start_seed, gen_length):
    result = []
    input_sequence = prepare_seed(start_seed, token_to_int)

    for _ in range(gen_length):
        padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
            [input_sequence], maxlen=seq_length, truncating='pre'
        )
        predicted = model.predict(padded_sequence, verbose=0)
        next_token = np.random.choice(range(vocab_size), p=predicted[0])

        input_sequence.append(next_token)
        result.append(int_to_token[next_token])
        input_sequence = input_sequence[1:]

    return ' '.join(result)

In [None]:
import time
seed_text = "it is a truth universally acknowledged"

try:
    while True:
        generated_text = generate_text(model, seed_text, 10)
        print("Generated Text:")
        print(generated_text)
        time.sleep(10)
except KeyboardInterrupt:
    print("Text generation stopped.")

Generated Text:
have expressions near arranged engaged his father never have dancing
Generated Text:
justify in character street each take impenetrably comforts between this
Generated Text:
exertion had much own word never you be exasperate austenâs
Generated Text:
readily yet not think last cousins dare hear checking opposed
Generated Text:
love its gentlemenâs exposing her âwhat be happy offer has
Generated Text:
all naturalness an happy paint whither do be suddenly certainly
Generated Text:
nothing âhe should be think part till is only crammed
Generated Text:
not all liked lambton town heaven sell gratitude must have
Generated Text:
spoken not yawn aloud 472 elizabeth hoped well collins improved
Generated Text:
mrs bennet anybody value first wickhamâs imaginations fancy them incredible
Generated Text:
girl miss she justice went imposed she stood sight pleased
Generated Text:
may philips bent you make expect infinitely application was very
Generated Text:
a bennetâs annou