In [17]:
import numpy as np

def load_data(path='shakespeare-2.txt'):
    """Read a UTF-8 text corpus and return its entire contents as one string.

    Args:
        path: Location of the text file. Defaults to 'shakespeare-2.txt',
            resolved against the current working directory.

    Returns:
        str: The full text of the file.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(path, mode='r', encoding='utf-8') as f:
        return f.read()


In [18]:
data = load_data()
words = data.split()  # whitespace tokenisation; punctuation stays attached to the words
distinct_words = sorted(set(words))  # vocabulary, in sorted order

# Two lookup tables over the same enumeration of the vocabulary:
# word -> index for encoding, index -> word for decoding generated text.
word_to_idx = {word: i for i, word in enumerate(distinct_words)}
idx_to_word = dict(enumerate(distinct_words))


In [19]:
# Sizes used throughout the notebook
N_seq = 50  # number of context words in each input sequence
N_words, N_vocab = len(words), len(distinct_words)  # corpus length, vocabulary size
print(N_words, N_vocab)


18582 5235


In [20]:
# Each training sample is a window of N_seq consecutive words (the input
# sequence) paired with the word that immediately follows it (the target).
window_starts = range(N_words - N_seq)
x_train = [
    [word_to_idx[word] for word in words[start:start + N_seq]]
    for start in window_starts
]
y_train = [word_to_idx[words[start + N_seq]] for start in window_starts]

m = len(x_train)
assert m == len(y_train), "Length mismatch error"


In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Pack the index lists into int64 numpy arrays:
# x_train becomes shape (m, N_seq) and y_train becomes shape (m,).
x_train, y_train = (
    np.array(samples, dtype=np.int64) for samples in (x_train, y_train)
)


class LSTMModel(nn.Module):
    """Next-word predictor: embedding -> stacked LSTM -> linear layer on the last time step."""

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=3):
        """Build the three layers.

        vocab_size: number of rows in the embedding table
        embedding_dim: size of each embedding vector (also the LSTM input size)
        hidden_size: size of the LSTM hidden state
        output_size: number of logits produced per sample
        num_layers: number of stacked LSTM layers
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of word indices to (batch, output_size) raw logits."""
        embedded = self.embedding(x)           # (batch, seq_len, embedding_dim)
        sequence_out, _ = self.lstm(embedded)  # LSTM output at every time step
        last_step = sequence_out[:, -1, :]     # keep only the final time step
        return self.fc(last_step)

# Hyper-parameters and the model instance used by the rest of the notebook.
# The output layer produces one logit per vocabulary word, hence output_size=N_vocab.
embedding_dim, hidden_size = 128, 512
model = LSTMModel(
    vocab_size=N_vocab,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    output_size=N_vocab,
)


In [22]:
import torch.optim as optim
import torch.nn as nn

# Training setup for the `model` instantiated earlier in the notebook.
# CrossEntropyLoss takes raw (unnormalised) logits and integer class indices, so:
#   * y_train can be used as the targets as-is (it holds word indices, not one-hot rows);
#   * model.forward() must return raw logits, with no softmax applied.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [23]:
import torch

PATH_SAVE = "shakespearean_generator_2.pth"  # default checkpoint file (.pth is the usual PyTorch extension)


def save_checkpoint(model, optimizer, epoch, loss, path=PATH_SAVE):
    """Write a training checkpoint to `path` using torch.save.

    The saved dictionary has four entries: 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss'.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    torch.save(checkpoint, path)


In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch

# Wrap the numpy arrays as LongTensors: the embedding layer needs integer
# indices as input and CrossEntropyLoss needs integer class labels as targets.
x_train_tensor = torch.tensor(x_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# Shuffled mini-batches of 128 (input window, next word) pairs
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

num_epochs = 30
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses accumulated over this epoch

    for inputs, labels in train_loader:
        # inputs: (batch, seq_len) word indices; labels: (batch,) class indices
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # The criterion returns the batch mean, so weight it by the batch size
        # (the last batch of an epoch can be smaller than the others).
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f}")

    # Checkpoint only when the mean training loss is the best seen so far
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        save_checkpoint(model, optimizer, epoch, best_loss)


Epoch 1/30 - Loss: 7.4962
Epoch 2/30 - Loss: 7.1304


Text Generation

In [None]:
def generate(seed_words, N_words):
    """Generate text by repeatedly sampling the next word from the model.

    Relies on the notebook-level `model`, `word_to_idx`, `idx_to_word` and
    `N_vocab`.

    seed_words: list of initial words (strings); each must be a key of
        `word_to_idx`
    N_words: number of new words to generate

    Returns the seed words followed by the generated words, joined by single
    spaces into one string.

    Raises:
        ValueError: if words are requested from an empty seed.
        KeyError: if a seed word is not in `word_to_idx`.
    """
    model.eval()  # set to evaluation mode

    if N_words > 0 and not seed_words:
        # The LSTM cannot run on a zero-length sequence; fail with a clear message.
        raise ValueError("seed_words must contain at least one word")

    # Build the input on whatever device the model's parameters live on, so
    # generation also works after the model has been moved to a GPU.
    device = next(model.parameters()).device

    # Convert seed words to indices
    window = [word_to_idx[word] for word in seed_words]
    generated_indices = list(window)

    with torch.no_grad():  # inference only: no gradient bookkeeping needed
        for _ in range(N_words):
            # Batch of size 1 holding the current context window
            x_tensor = torch.tensor([window], dtype=torch.long, device=device)

            logits = model(x_tensor)  # raw logits, shape (1, N_vocab)
            probs = F.softmax(logits, dim=1).cpu().numpy().ravel()

            # Sample the next word index from the predicted distribution;
            # int() turns the numpy integer into a plain Python int.
            idx = int(np.random.choice(N_vocab, p=probs))
            generated_indices.append(idx)

            # Slide the window: drop the oldest index, append the new one
            window = window[1:] + [idx]

    return ' '.join(idx_to_word[i] for i in generated_indices)


In [None]:
initial_seed = "your awesome character is very powerful today".lower()
seed_words = initial_seed.split()

# Ensure all words are in the vocabulary
words_input = set(seed_words)
words_valid = set(word_to_idx.keys())
invalid_words = words_input.difference(words_valid)
if invalid_words:
    # Bad input data is a ValueError; SyntaxError is meant for unparsable Python source.
    raise ValueError(f"Input contains invalid words: {invalid_words}")

# Truncate long sequences
if len(seed_words) > N_seq:
    seed_words = seed_words[-N_seq:]  # keep the last N_seq words

# Left-pad short sequences, but only with a token that really is in the
# vocabulary. The vocabulary comes from a whitespace split, so ' ' is never a
# key of word_to_idx, and padding with an unknown token would raise KeyError as
# soon as the seed is converted to indices. When no pad token is available the
# seed is left unpadded (N_pad = 0); the LSTM accepts inputs shorter than N_seq.
pad_token = next((token for token in ('<PAD>', ' ') if token in word_to_idx), None)
N_pad = max(N_seq - len(seed_words), 0) if pad_token is not None else 0

seed_words = [pad_token] * N_pad + seed_words

print("The seed words are:", seed_words)


In [None]:
# Integer-encode the seed words through the vocabulary lookup table
seed = list(map(word_to_idx.__getitem__, seed_words))

In [None]:
# generate() returns one space-joined string, so the prepended padding (if any)
# has to be removed by word (split, slice, re-join) rather than by character.
generated_sentence = ' '.join(generate(seed_words, 500).split(' ')[N_pad:])

In [None]:
# generate() has already decoded the indices into a space-joined string, so the
# text is printed as-is; looking its characters up in idx_to_word raises KeyError.
print(generated_sentence)

In [None]:
# Save only the model's state_dict (its parameters and buffers), not the model object
torch.save(obj=model.state_dict(), f='shakespeare_final.pth')

In [None]:
# Rebuild the architecture with the same hyper-parameters used for the trained
# model, then load the saved weights into it. LSTMModel takes
# (vocab_size, embedding_dim, hidden_size, output_size); the vocabulary size
# serves as both the embedding table size and the number of output logits.
model = LSTMModel(
    vocab_size=N_vocab,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    output_size=N_vocab,
    num_layers=3,
)
model.load_state_dict(torch.load('shakespeare_final.pth'))
model.eval()  # set to evaluation mode