In [2]:
lines_of_text = 70

with open('story_data.txt', 'r', encoding='utf-8') as file:
    lines = [next(file) for _ in range(lines_of_text)]

# Join lines into a single string
text = ''.join(lines)
print('Sample Text:', text[:200])

Sample Text: The ways of God in Nature, as in Providence, are not as our ways; nor are the models that we frame any way commensurate to the vastness, profundity, and unsearchableness of His works, which have a dep


In [2]:
len(text)

1914347

In [3]:
# Some basic pre-processing
import re

# Convert to lowercase
text = text.lower()

# Remove special characters and digits (optional)
text = re.sub(r'[^a-zA-Z\s]', '', text)

# Replace multiple spaces with a single space
text = re.sub(r'\s+', ' ', text).strip()

In [4]:
# Advanced pre-processing and vectorisation
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Tokenize the text
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer(text)

# Build the vocabulary
def yield_tokens(text):
    yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(text), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

print('Vocabulary Size:', len(vocab))


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\ASUS\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python311\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python311\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.

Vocabulary Size: 24551


In [5]:
import torch
import numpy as np

# Convert all tokens to indices using the vocabulary
encoded_text = [vocab[token] for token in tokens]

# Set the sequence length
sequence_length = 50


# Create input-output sequences
input_sequences = []
output_tokens = []

for i in range(sequence_length, len(encoded_text)):
    input_sequences.append(encoded_text[i-sequence_length:i])
    output_tokens.append(encoded_text[i])

# Convert to tensors
X = torch.tensor(input_sequences)
y = torch.tensor(output_tokens)

print('Input Shape:', X.shape)   # (num_sequences, sequence_length)
print('Output Shape:', y.shape)  # (num_sequences,)

Input Shape: torch.Size([329889, 50])
Output Shape: torch.Size([329889])


In [6]:
from torch.utils.data import TensorDataset, DataLoader

# Create TensorDataset and DataLoader for the entire dataset
full_dataset = TensorDataset(X, y)
full_loader = DataLoader(full_dataset, batch_size=64, shuffle=True)

print('Number of Batches:', len(full_loader))

Number of Batches: 5155


In [7]:
import torch
import torch.nn as nn

class TextGenerationLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=256, num_layers=2):
        super(TextGenerationLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])  # Predict the next token
        return out, hidden

    def init_hidden(self, batch_size, device):
        # Initialize hidden states (h0, c0) with zeros
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return (h0, c0)

# Instantiate the model
vocab_size = len(vocab)  # Vocabulary size from torchtext
model = TextGenerationLSTM(vocab_size)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(model)

TextGenerationLSTM(
  (embedding): Embedding(24551, 100)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=24551, bias=True)
)


In [8]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [9]:
# Training configuration
num_epochs = 10

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0

    for inputs, targets in full_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Initialize hidden states
        hidden = model.init_hidden(inputs.size(0), device)
        
        # Forward pass
        outputs, hidden = model(inputs, hidden)
        
        # Calculate loss
        loss = criterion(outputs, targets)
        
        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    avg_loss = total_loss / len(full_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

Epoch [1/10], Loss: 7.7672
Epoch [2/10], Loss: 7.5131
Epoch [3/10], Loss: 7.8824
Epoch [4/10], Loss: 7.4394
Epoch [5/10], Loss: 7.1941
Epoch [6/10], Loss: 7.0639
Epoch [7/10], Loss: 7.0393
Epoch [8/10], Loss: 6.9908
Epoch [9/10], Loss: 6.9566
Epoch [10/10], Loss: 6.9405


In [10]:
import torch.nn.functional as F

# Set the model to evaluation mode
model.eval()

def generate_text(model, start_text, vocab, tokenizer, max_length=100, temperature=1.0):
   
    # Tokenize the start text
    tokens = tokenizer(start_text)
    input_seq = [vocab[token] for token in tokens]
    input_seq = torch.tensor(input_seq).unsqueeze(0).to(device)  # (1, seq_len)
    
    # Initialize hidden states
    hidden = model.init_hidden(input_seq.size(0), device)
    
    # Collect generated tokens
    generated_tokens = tokens.copy()
    
    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            # Forward pass
            output, hidden = model(input_seq, hidden)
            
            # Apply temperature to logits
            output = output / temperature
            probs = F.softmax(output, dim=-1).squeeze()
            
            # Sample the next token
            next_token_id = torch.multinomial(probs, num_samples=1).item()
            next_token = vocab.lookup_token(next_token_id)
            
            # Stop if end of sequence token is generated
            if next_token == '<eos>':
                break
            
            # Add the tokn to the generated sequence
            generated_tokens.append(next_token)
            
            # Update input sequence
            input_seq = torch.tensor([next_token_id]).unsqueeze(0).to(device)
    
    # Join tokens to formd the final text
    generated_text = ' '.join(generated_tokens)
    return generated_text

In [11]:
# Define the seed text to start generating
seed_text = "True! – nervous – very, very dreadfully nervous I had been and am; but why will you say that I am mad?"

# Generate text
generated_text = generate_text(model, seed_text, vocab, tokenizer, max_length=250, temperature=1.1)

# Display the generated text
print("\nGenerated Text:\n", generated_text)



Generated Text:
 true ! – nervous – very , very dreadfully nervous i had been and am but why will you say that i am mad ? of in uneasiness again would graves there will always finally waited why when assumed the more upon the the press he is shrouded engaged ten knew it effected lay of the natural very circumstances among pantaloons overboard the careless kisses perceive electrical were politics he had to no invention upon an firm have soon but on up and about which while he rode in its consequence it lie pretending just repeated mystified i picked afterwards did repeated frequently upon lucretius open our suit in the first decisionsthe have thought one still beyond good open a poor lips that had before skeered and in night now of armed ever absolutely visible de gaping among a tongues that was necessarily attempt person as which or color at my article contes all in example in the stomach it do to our bottom not diversity i had received it and will buried either gust was exceedingly c

## Last step is of deploying the model to streamlit..

In [13]:
torch.save(model, 'genai-stories.pth')