# Text Generation Model using Gen-AI Architecture.
## Using an advanced variant of the RNN (Recurrent Neural Network): the LSTM (Long Short-Term Memory) network.



- Outline of the workflow:
1. Download and Gather data.
2. Prepare the data for training.
3. Build an LSTM model.
4. Train and Evaluate the model.
5. Generate Text.


### 1. We have collected the data from various sources.

### 2. Pre-Processing the text data

In [None]:
# Corpus file to clean and the marker string that must be stripped from it
file_path = 'dialogues_text.txt'
word_to_remove = '__eou__'

# Pull the whole file into memory as a single string
with open(file_path, 'r', encoding='utf-8') as src:
    content = src.read()

# Drop every exact (case-sensitive) occurrence of the marker
updated_content = content.replace(word_to_remove, '')

# Overwrite the same file in place with the cleaned text
with open(file_path, 'w', encoding='utf-8') as dst:
    dst.write(updated_content)

print(f"The word '{word_to_remove}' has been removed from the file.")

In [None]:
# Load the cleaned corpus into memory as one UTF-8 decoded string.
with open('dialogues_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Total number of characters in the loaded corpus
len(text)

In [None]:
import torchtext

- Some basic pre-processing using the regex module.

In [None]:
# Basic text normalisation with the regex module.
import re

# Anything that is not a letter or whitespace (digits, punctuation, symbols)
non_letters = re.compile(r'[^a-zA-Z\s]')
# Any run of whitespace: spaces, tabs, newlines
whitespace_run = re.compile(r'\s+')

# Lowercase, delete the unwanted characters outright (they are not replaced
# by a space), then squeeze each whitespace run to one space and trim the ends.
text = whitespace_run.sub(' ', non_letters.sub('', text.lower())).strip()

- Tokenizing the pre-processed text with torchtext.

In [None]:
# Importing the torchtext module
import torch
from torchtext.data.utils import get_tokenizer

# Build torchtext's built-in 'basic_english' tokenizer: a callable that turns a string into a list of word tokens.
tokenizer = get_tokenizer('basic_english') # English word tokens

In [None]:
# Run the tokenizer over the whole cleaned corpus to get one flat list of word tokens.
tokens = tokenizer(text)

In [None]:
# Total number of word tokens in the corpus
len(tokens)

- Building the vocabulary for text data.

In [None]:
# Fixed length that every token sequence is truncated or padded to
MAX_PADDING = 100


def pad_token(tokens, pad_value="<pad>"):
    """Return a token list that is exactly ``MAX_PADDING`` items long.

    Sequences that are too long are cut off at ``MAX_PADDING``; shorter ones
    are extended on the right with ``pad_value``.

    Args:
        tokens: List of tokens to normalise in length.
        pad_value: Filler appended to short sequences. It is an explicit
            parameter so the filler never depends on a global name (using the
            name ``pad_token`` here would append this function object itself).

    Returns:
        A new list of length ``MAX_PADDING``; ``tokens`` is not modified.
    """
    if len(tokens) >= MAX_PADDING:
        return tokens[:MAX_PADDING]
    return tokens + [pad_value] * (MAX_PADDING - len(tokens))

In [None]:
# Implementing the vocabulary.

from torchtext.vocab import build_vocab_from_iterator # Vocab module

# Upper bound on the number of entries kept in the vocabulary
VOCAB_SIZE = 50_000

# Special tokens: a stand-in for unknown words and a padding filler.
# NOTE: binding `pad_token` to a string here replaces the `pad_token`
# function defined in an earlier cell under the same name.
unk_token = "<unk>"
pad_token = "<pad>"

# The whole corpus is handed over as a single token list inside a one-item iterable
special_tokens = [unk_token, pad_token]
vocab = build_vocab_from_iterator(
    [tokens],
    specials=special_tokens,
    max_tokens=VOCAB_SIZE,
)

In [None]:
# Number of entries in the vocabulary
len(vocab)

In [None]:
# Index assigned to the unknown-word token
vocab[unk_token]

In [None]:
# Handle unknown tokens: make the <unk> index the vocabulary's default, so looking up a word that is not in the vocabulary yields that index.
vocab.set_default_index(vocab[unk_token])

In [None]:
vocab['rikin'] # Example lookup; a word missing from the vocabulary gives the default index set above

In [None]:
# Convert the full token list to its vocabulary indices so they can be inspected.
sample_view = vocab.lookup_indices(tokens)

In [None]:
# Indices of the first ten tokens
sample_view[:10]

### Now we build the input sequences and target tokens for our model.


In [None]:
import torch
import numpy

# Convert all tokens to indices in a single vocabulary call. Tokens missing
# from the vocabulary rely on the default index configured earlier.
encoded_text = vocab.lookup_indices(tokens)

# Number of context tokens the model sees before predicting the next one
sequence_length = 100

# At least one token must follow a full window, otherwise there is nothing to train on.
if len(encoded_text) <= sequence_length:
    raise ValueError(
        f"Need more than {sequence_length} tokens to build training windows, "
        f"got {len(encoded_text)}."
    )

encoded_tensor = torch.tensor(encoded_text, dtype=torch.long)

# Sliding windows of `sequence_length` tokens with stride 1. `unfold` returns
# a strided view of `encoded_tensor`, so the overlapping windows are not
# copied; the final window is dropped because no token follows it.
X = encoded_tensor.unfold(0, sequence_length, 1)[:-1]

# Target for window i is the token right after it: encoded_text[i + sequence_length]
y = encoded_tensor[sequence_length:]

print('Input Shape:', X.shape)   # (num_sequences, sequence_length)
print('Output Shape:', y.shape)  # (num_sequences,)

- Making a TensorDataset and Dataloader..

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Pair every input window with its target token, then serve the pairs in
# shuffled mini-batches of 128 (the whole dataset is used; there is no split).
full_dataset = TensorDataset(X, y)
full_loader = DataLoader(dataset=full_dataset, shuffle=True, batch_size=128)

# len() of the loader is the number of mini-batches it yields per pass
print('Number of Batches:', len(full_loader))


### Now we define a custom class for the LSTM model.

In [None]:
import torch
import torch.nn as nn

class TextGenerationLSTM(nn.Module):
    """Next-token predictor: embedding -> stacked LSTM -> linear layer over the vocabulary.

    Args:
        vocab_size: Number of distinct token ids; also the size of the output layer.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=256, num_layers=2):
        super().__init__()
        # Kept on the instance because init_hidden() needs them to size the states
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Token id -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # batch_first=True: inputs and outputs are laid out as (batch, seq_len, features)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

        # Hidden state of the last time step -> one score per vocabulary entry
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Score the token that follows each input sequence.

        Args:
            x: Integer token ids of shape (batch, seq_len).
            hidden: Optional ``(h0, c0)`` tuple as produced by ``init_hidden``.
                When omitted, ``nn.LSTM`` starts from zero states.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape (batch, vocab_size)
            and ``hidden`` is the LSTM state after the last time step.
        """
        embedded = self.embedding(x)
        out, hidden = self.lstm(embedded, hidden)
        # Only the final time step is used to predict the next token
        logits = self.fc(out[:, -1, :])
        return logits, hidden

    def init_hidden(self, batch_size, device=None):
        """Return zero-filled ``(h0, c0)`` states for a batch.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device to allocate the states on; ``None`` uses the
                default device.

        Returns:
            Tuple of two tensors, each of shape (num_layers, batch_size, hidden_dim).
        """
        shape = (self.num_layers, batch_size, self.hidden_dim)
        # Allocate directly on the target device instead of creating on CPU and copying
        h0 = torch.zeros(shape, device=device)
        c0 = torch.zeros(shape, device=device)
        return (h0, c0)

In [None]:
# Build the network and place it on the best available device.

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One output score per vocabulary entry
vocab_size = len(vocab)
model = TextGenerationLSTM(vocab_size).to(device)

print(model)

## Loss and Optimization of the model.

In [None]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Cross-entropy between the model's output scores and the target token ids.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # Adam over all model parameters with a learning rate of 0.001.

In [None]:
# Number of full passes over the dataset
num_epochs = 5

for epoch in range(num_epochs):
    model.train()  # training mode for the whole epoch
    batch_losses = []

    for inputs, targets in full_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Fresh zero state for every batch; state is not carried between batches
        hidden = model.init_hidden(inputs.size(0), device)

        # Clear old gradients, then forward -> loss -> backward -> parameter update
        optimizer.zero_grad()
        outputs, hidden = model(inputs, hidden)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses over the epoch
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(full_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

## Finally, implementing the function that generates text.

In [None]:
import torch.nn.functional as F

# Switch the model to evaluation mode before generating text
model.eval()

def generate_text(model, start_text, vocab, tokenizer, max_length=100, temperature=1.0):
    """Sample a continuation of ``start_text`` from the model, one token at a time.

    The seed is fed through the model once; after that each sampled token is
    fed back in on its own while the LSTM state is carried forward.

    Args:
        model: Network exposing ``init_hidden(batch_size, device)`` and
            returning ``(logits, hidden)`` when called as ``model(ids, hidden)``.
        start_text: Seed string; it must produce at least one token.
        vocab: Object supporting ``vocab[token]`` (token -> id) and
            ``vocab.lookup_token(id)`` (id -> token).
        tokenizer: Callable turning a string into a list of tokens.
        max_length: Maximum number of tokens to generate after the seed.
        temperature: Positive value the logits are divided by before softmax;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The seed tokens followed by the generated tokens, joined by single spaces.

    Raises:
        ValueError: If ``temperature`` is not positive or ``start_text``
            yields no tokens.

    Note:
        The model is left in evaluation mode when this function returns.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    tokens = tokenizer(start_text)
    if not tokens:
        raise ValueError("start_text must contain at least one token")

    # Use the device the model's weights live on instead of a notebook-level global
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # Shape (1, seq_len): a batch holding only the seed sequence
    seed_ids = [vocab[token] for token in tokens]
    input_seq = torch.tensor([seed_ids], dtype=torch.long, device=run_device)

    hidden = model.init_hidden(input_seq.size(0), run_device)

    # Copy so the tokenizer's own list is never mutated
    generated_tokens = list(tokens)

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            logits, hidden = model(input_seq, hidden)

            # Temperature-scaled distribution over the vocabulary
            probs = F.softmax(logits / temperature, dim=-1).squeeze(0)

            # Sample rather than take the arg-max so the output varies between runs
            next_token_id = torch.multinomial(probs, num_samples=1).item()
            next_token = vocab.lookup_token(next_token_id)

            # Only takes effect for vocabularies that contain an '<eos>' token
            if next_token == '<eos>':
                break

            generated_tokens.append(next_token)

            # The next step sees just the new token; earlier context lives in `hidden`
            input_seq = torch.tensor([[next_token_id]], dtype=torch.long, device=run_device)

    return ' '.join(generated_tokens)

# Use the function for generating text..

In [None]:
# Seed text that the generated continuation starts from.
# NOTE: the training text had every non-letter character removed before the vocabulary was built,
# so the "," below is expected to map to the vocabulary's default (<unk>) index — TODO confirm.
seed_text = "May I help you find something , sir "

# Generate up to 25 new tokens; the logits are divided by the temperature (0.99) before sampling
generated_text = generate_text(model, seed_text, vocab, tokenizer, max_length=25, temperature=0.99)

# Display the generated text
print("\nGenerated Text:\n", generated_text)

In [36]:

# Save the trained model object to disk.
# NOTE(review): this stores the whole module object rather than model.state_dict(); loading the file
# later presumably needs the TextGenerationLSTM class definition to be available — confirm before sharing the file.
torch.save(model, 'genai-Q&A.pth')