Build the models for rnn.LSTM and rnn.GRU for the tiny Shakespeare dataset; the data loader code is already provided.

Train the models for sequence lengths of 20 and 30; report and compare training loss, validation accuracy, execution time for training, and computational and model size complexities across the two models.
Adjust the hyperparameters (fully connected network, number of hidden layers, and the number of hidden states) and compare your results (training and validation loss, computation complexity, model size, training and inference time, and the output sequence). Analyze their influence on accuracy, running time, and computational perplexity.
What if we increase the sequence length to 50? Perform the training and report the accuracy and model complexity results.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
import requests
import time

# Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from silently becoming the
# training corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Prepare the dataset: build the character vocabulary and integer-encode the text
sequence_length = 20
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
encoded_text = [char_to_int[ch] for ch in text]

# At least one (window, next-character) pair is needed; fail here with a clear
# message instead of producing an empty dataset that breaks later.
if len(encoded_text) <= sequence_length:
    raise ValueError(
        f"Corpus has {len(encoded_text)} characters; need more than "
        f"sequence_length={sequence_length} to build any training example."
    )

# Create sequences and targets directly as PyTorch tensors.
# Row i of `sequences` is encoded_text[i:i+sequence_length] and targets[i] is
# the character that follows it, encoded_text[i+sequence_length].
# unfold() yields every stride-1 window; the last window has no following
# character, so it is dropped with [:-1].
encoded_tensor = torch.tensor(encoded_text, dtype=torch.long)
sequences = encoded_tensor.unfold(0, sequence_length, 1)[:-1].contiguous()
targets = encoded_tensor[sequence_length:].clone()

# Define the dataset class
class CharDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its target.

    ``sequences`` and ``targets`` are stored exactly as passed in and are
    indexed in lockstep by ``__getitem__``.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        # The number of samples is defined by ``sequences``.
        return len(self.sequences)

    def __getitem__(self, index):
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label

# Split the dataset into training and testing sets (80% / 20% of the samples).
# random_split is called without a `generator`, so the partition depends on
# the global RNG state at the time this cell runs.
# NOTE(review): the windows are built with stride 1, so neighbouring samples
# overlap heavily; a random split therefore puts near-identical windows in
# both subsets — confirm this is acceptable for the reported accuracy.
dataset = CharDataset(sequences, targets)
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders: training batches are reshuffled every epoch, the test
# loader keeps a fixed order.
batch_size = 128
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Define the LSTM model
class LSTMModel(nn.Module):
    """Next-character classifier: Embedding -> LSTM -> Linear.

    Args:
        input_size: Number of distinct token ids (rows of the embedding table).
        hidden_size: Width of both the embedding vectors and the LSTM hidden
            state.
        output_size: Number of logits produced for each input sequence.
        num_layers: Number of stacked LSTM layers. The default of 1 builds
            the same single-layer network as before this parameter existed.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(
            hidden_size, hidden_size, num_layers=num_layers, batch_first=True
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map integer token ids ``(batch, seq_len)`` to logits ``(batch, output_size)``."""
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        # With batch_first=True the output is (batch, seq_len, hidden); only
        # the last time step feeds the classifier.
        return self.fc(output[:, -1, :])

# Define the GRU model
class GRUModel(nn.Module):
    """Next-character classifier: Embedding -> GRU -> Linear.

    Args:
        input_size: Number of distinct token ids (rows of the embedding table).
        hidden_size: Width of both the embedding vectors and the GRU hidden
            state.
        output_size: Number of logits produced for each input sequence.
        num_layers: Number of stacked GRU layers. The default of 1 builds
            the same single-layer network as before this parameter existed.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(
            hidden_size, hidden_size, num_layers=num_layers, batch_first=True
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map integer token ids ``(batch, seq_len)`` to logits ``(batch, output_size)``."""
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        # With batch_first=True the output is (batch, seq_len, hidden); only
        # the last time step feeds the classifier.
        return self.fc(output[:, -1, :])

In [18]:
import os
import sys

# Train the model with validation loss, model size, number of parameters, and inference time
def train_model(model, train_loader, test_loader, device, num_epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, evaluating after every epoch.

    Everything is reported through ``print``: the trainable-parameter count,
    one line of metrics per epoch (mean train loss, mean test loss, test
    accuracy, evaluation wall-clock time), the total elapsed time, and the
    size in bytes of the tensors in the model's state dict.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, targets)`` batches; must support
            ``len()``.
        test_loader: Iterable of ``(inputs, targets)`` batches used for
            evaluation; must support ``len()``.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        None.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    start_time = time.time()

    # Calculate the number of trainable parameters
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'Number of trainable parameters: {num_params}')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        correct = 0
        total = 0
        test_loss = 0
        inference_start_time = time.time()  # Start timer for inference
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                # Index of the largest logit is the predicted class.
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()
        inference_end_time = time.time()  # End timer for inference
        inference_time = inference_end_time - inference_start_time

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader)}, Test Loss: {test_loss/len(test_loader)}, Test Accuracy: {100 * correct / total}%, Inference Time: {inference_time} seconds')

    # The timer spans the whole loop, so this figure includes the per-epoch
    # evaluation passes as well as the optimisation steps.
    end_time = time.time()
    print(f'Total Training Time: {end_time - start_time} seconds')

    # Calculate model size as the bytes held by every tensor in the state
    # dict (parameters and buffers). sys.getsizeof() on the state dict would
    # only measure the dictionary container, not the tensors it refers to.
    model_size = sum(
        tensor.numel() * tensor.element_size()
        for tensor in model.state_dict().values()
    )
    print(f'Model Size: {model_size} bytes')

# Set device: use CUDA when it is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters for this run.
# input_size and output_size both equal the vocabulary size: the models embed
# character ids and emit one logit per character.
# hidden_size is used by the models for both the embedding width and the
# recurrent hidden state.
input_size = len(chars)
hidden_size = 256
output_size = len(chars)
num_epochs = 10
lr = 0.001

# Train the LSTM first, then a GRU with identical hyperparameters on the same
# loaders. The train_model defined in this cell only prints its metrics, so
# nothing is captured here.
print("Training LSTM model...")
lstm_model = LSTMModel(input_size, hidden_size, output_size)
train_model(lstm_model, train_loader, test_loader, device, num_epochs, lr)

print("\nTraining GRU model...")
gru_model = GRUModel(input_size, hidden_size, output_size)
train_model(gru_model, train_loader, test_loader, device, num_epochs, lr)


Training LSTM model...
Number of trainable parameters: 559681
Epoch 1/10, Train Loss: 1.7076634001574569, Test Loss: 1.5441576732141973, Test Accuracy: 53.19107923344167%, Inference Time: 10.234908103942871 seconds
Epoch 2/10, Train Loss: 1.4844536945283653, Test Loss: 1.4690568940227768, Test Accuracy: 55.16485486943853%, Inference Time: 9.065066814422607 seconds
Epoch 3/10, Train Loss: 1.424019436503168, Test Loss: 1.430919986994049, Test Accuracy: 55.891964585901604%, Inference Time: 9.149778604507446 seconds
Epoch 4/10, Train Loss: 1.3885324373371082, Test Loss: 1.4122446166053286, Test Accuracy: 56.7091785274011%, Inference Time: 8.964710235595703 seconds
Epoch 5/10, Train Loss: 1.3638709488950511, Test Loss: 1.397402367780623, Test Accuracy: 57.01714669954051%, Inference Time: 8.97447919845581 seconds
Epoch 6/10, Train Loss: 1.344998960326064, Test Loss: 1.3912321806501124, Test Accuracy: 56.99114647540065%, Inference Time: 8.885128498077393 seconds
Epoch 7/10, Train Loss: 1.3298

In [22]:
# Hyperparameter variation: hidden size 128 with learning rate 0.01.
# input_size and output_size both equal the vocabulary size.
# NOTE(review): the captured output for this cell shows the LSTM train loss
# rising again after epoch 3 (1.7324 -> 1.7477 by epoch 6); the learning rate
# may be too high for this configuration — confirm before drawing conclusions.
input_size = len(chars)
hidden_size = 128
output_size = len(chars)
num_epochs = 10
lr = 0.01

# Train the LSTM, then a GRU with the same settings on the same loaders.
print("Training LSTM model...")
lstm_model = LSTMModel(input_size, hidden_size, output_size)
train_model(lstm_model, train_loader, test_loader, device, num_epochs, lr)

print("\nTraining GRU model...")
gru_model = GRUModel(input_size, hidden_size, output_size)
train_model(gru_model, train_loader, test_loader, device, num_epochs, lr)

Training LSTM model...
Number of trainable parameters: 148801
Epoch 1/10, Train Loss: 1.8319664558014237, Test Loss: 1.7497204281582068, Test Accuracy: 48.02465538496021%, Inference Time: 3.2053937911987305 seconds
Epoch 2/10, Train Loss: 1.736948809077257, Test Loss: 1.7414866521723709, Test Accuracy: 48.492211139751205%, Inference Time: 3.190444231033325 seconds
Epoch 3/10, Train Loss: 1.7324021682271449, Test Loss: 1.737878772491569, Test Accuracy: 48.50521125182114%, Inference Time: 3.4379940032958984 seconds
Epoch 4/10, Train Loss: 1.7348074167368401, Test Loss: 1.7469942238025808, Test Accuracy: 48.314244088311106%, Inference Time: 3.2713723182678223 seconds
Epoch 5/10, Train Loss: 1.7375169511813922, Test Loss: 1.7604003551015892, Test Accuracy: 47.85475736859801%, Inference Time: 3.1926138401031494 seconds
Epoch 6/10, Train Loss: 1.7477148259705124, Test Loss: 1.7704925211557903, Test Accuracy: 47.39840860697075%, Inference Time: 3.3794829845428467 seconds
Epoch 7/10, Train Los

In [2]:
# Prepare the dataset with sequence length 30
sequence_length = 30
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
encoded_text = [char_to_int[ch] for ch in text]

# At least one (window, next-character) pair is needed; fail here with a clear
# message instead of producing an empty dataset that breaks later.
if len(encoded_text) <= sequence_length:
    raise ValueError(
        f"Corpus has {len(encoded_text)} characters; need more than "
        f"sequence_length={sequence_length} to build any training example."
    )

# Build the samples directly as tensors instead of via a Python list of lists.
# Row i of `sequences` is encoded_text[i:i+sequence_length] and targets[i] is
# the character that follows it, encoded_text[i+sequence_length].
# unfold() yields every stride-1 window; the last window has no following
# character, so it is dropped with [:-1].
encoded_tensor = torch.tensor(encoded_text, dtype=torch.long)
sequences = encoded_tensor.unfold(0, sequence_length, 1)[:-1].contiguous()
targets = encoded_tensor[sequence_length:].clone()

# Define the dataset class
class CharDataset(Dataset):
    """Indexable collection of ``(sequence, target)`` pairs.

    Both containers are kept as supplied; item ``i`` is the pair
    ``(sequences[i], targets[i])``.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        # Sample count comes from ``sequences``.
        n_samples = len(self.sequences)
        return n_samples

    def __getitem__(self, index):
        pair = (self.sequences[index], self.targets[index])
        return pair

# Split the dataset into training and testing sets (80% / 20% of the samples).
# random_split is called without a `generator`, so the partition depends on
# the global RNG state at the time this cell runs.
# NOTE(review): the windows are built with stride 1, so neighbouring samples
# overlap heavily; a random split therefore puts near-identical windows in
# both subsets — confirm this is acceptable for the reported accuracy.
dataset = CharDataset(sequences, targets)
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders: training batches are reshuffled every epoch, the test
# loader keeps a fixed order.
batch_size = 128
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [3]:
def train_model(model, train_loader, test_loader, device, num_epochs=10, lr=0.001):
    """Fit ``model`` with Adam / cross-entropy and collect per-epoch metrics.

    After each pass over ``train_loader`` the model is evaluated on
    ``test_loader`` and one summary line is printed.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, targets)`` batches; must support
            ``len()``.
        test_loader: Iterable of ``(inputs, targets)`` batches used for
            validation; must support ``len()``.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        A 5-tuple ``(train_losses, val_losses, val_accuracies, training_time,
        num_trainable_params)``. The three lists hold one value per epoch
        (mean batch loss, mean batch loss, accuracy in percent);
        ``training_time`` is the wall-clock seconds for the whole loop,
        validation passes included.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    started = time.time()

    train_losses, val_losses, val_accuracies = [], [], []

    for epoch_idx in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            adam.zero_grad()
            batch_loss = loss_fn(model(batch_x), batch_y)
            batch_loss.backward()
            adam.step()
            running += batch_loss.item()
        train_losses.append(running / len(train_loader))

        # ---- validation pass (no gradients) ----
        model.eval()
        running = 0
        hits = 0
        seen = 0
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                logits = model(batch_x)
                running += loss_fn(logits, batch_y).item()
                # Index of the largest logit is the predicted class.
                predicted = torch.max(logits.data, 1)[1]
                seen += batch_y.size(0)
                hits += (predicted == batch_y).sum().item()
        val_losses.append(running / len(test_loader))
        val_accuracies.append(100 * hits / seen)

        print(f'Epoch {epoch_idx+1}/{num_epochs}, Train Loss: {train_losses[-1]}, Val Loss: {val_losses[-1]}, Val Accuracy: {val_accuracies[-1]}%')

    training_time = time.time() - started
    print(f'Training time: {training_time} seconds')

    trainable_params = sum(
        param.numel() for param in model.parameters() if param.requires_grad
    )
    return train_losses, val_losses, val_accuracies, training_time, trainable_params



In [4]:
# Set device: use CUDA when it is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters for this run.
# input_size and output_size both equal the vocabulary size; hidden_size is
# used by the models for both the embedding width and the recurrent state.
input_size = len(chars)
hidden_size = 256
output_size = len(chars)
num_epochs = 10
lr = 0.001

# train_model returns (train losses, val losses, val accuracies, elapsed
# seconds, trainable-parameter count). The *_model_size variables therefore
# hold a parameter count, not a size in bytes.
print("Training LSTM model...")
lstm_model = LSTMModel(input_size, hidden_size, output_size)
lstm_train_losses, lstm_val_losses, lstm_val_acc, lstm_training_time, lstm_model_size = train_model(lstm_model, train_loader, test_loader, device, num_epochs, lr)

print("\nTraining GRU model...")
gru_model = GRUModel(input_size, hidden_size, output_size)
gru_train_losses, gru_val_losses, gru_val_acc, gru_training_time, gru_model_size = train_model(gru_model, train_loader, test_loader, device, num_epochs, lr)

# Compare results: report the final-epoch metrics of each model side by side
print("\nComparison of LSTM and GRU models:")
print(f"LSTM - Train Loss: {lstm_train_losses[-1]}, Val Loss: {lstm_val_losses[-1]}, Val Accuracy: {lstm_val_acc[-1]}%, Training Time: {lstm_training_time} seconds, Model Size: {lstm_model_size} parameters")
print(f"GRU - Train Loss: {gru_train_losses[-1]}, Val Loss: {gru_val_losses[-1]}, Val Accuracy: {gru_val_acc[-1]}%, Training Time: {gru_training_time} seconds, Model Size: {gru_model_size} parameters")

Training LSTM model...
Epoch 1/10, Train Loss: 1.6987307694918523, Val Loss: 1.5201707640668132, Val Accuracy: 53.98322522223667%
Epoch 2/10, Train Loss: 1.4741471164522675, Val Loss: 1.4466245311878223, Val Accuracy: 55.59749499042914%
Epoch 3/10, Train Loss: 1.4144343402337152, Val Loss: 1.420986441244562, Val Accuracy: 56.36406019554137%
Epoch 4/10, Train Loss: 1.377680996993938, Val Loss: 1.3944001756245148, Val Accuracy: 57.178591761441325%
Epoch 5/10, Train Loss: 1.3519627196277206, Val Loss: 1.3768365035368941, Val Accuracy: 57.65825536931857%
Epoch 6/10, Train Loss: 1.3322405070835874, Val Loss: 1.3689436094554348, Val Accuracy: 57.6905317990075%
Epoch 7/10, Train Loss: 1.3168419872592798, Val Loss: 1.3603249148124261, Val Accuracy: 58.07740067152905%
Epoch 8/10, Train Loss: 1.3043302338628062, Val Loss: 1.3557574289732806, Val Accuracy: 58.13388442348469%
Epoch 9/10, Train Loss: 1.2934165609871184, Val Loss: 1.350373524806996, Val Accuracy: 58.357129728833165%
Epoch 10/10, Tra

In [5]:
# Hyperparameter variation: hidden size 128 with learning rate 0.001.
# input_size and output_size both equal the vocabulary size.
input_size = len(chars)
hidden_size = 128
output_size = len(chars)
num_epochs = 10
lr = 0.001

# train_model returns (train losses, val losses, val accuracies, elapsed
# seconds, trainable-parameter count). The *_model_size variables therefore
# hold a parameter count, not a size in bytes.
print("Training LSTM model...")
lstm_model = LSTMModel(input_size, hidden_size, output_size)
lstm_train_losses, lstm_val_losses, lstm_val_acc, lstm_training_time, lstm_model_size = train_model(lstm_model, train_loader, test_loader, device, num_epochs, lr)

print("\nTraining GRU model...")
gru_model = GRUModel(input_size, hidden_size, output_size)
gru_train_losses, gru_val_losses, gru_val_acc, gru_training_time, gru_model_size = train_model(gru_model, train_loader, test_loader, device, num_epochs, lr)

# Compare results: report the final-epoch metrics of each model side by side
print("\nComparison of LSTM and GRU models:")
print(f"LSTM - Train Loss: {lstm_train_losses[-1]}, Val Loss: {lstm_val_losses[-1]}, Val Accuracy: {lstm_val_acc[-1]}%, Training Time: {lstm_training_time} seconds, Model Size: {lstm_model_size} parameters")
print(f"GRU - Train Loss: {gru_train_losses[-1]}, Val Loss: {gru_val_losses[-1]}, Val Accuracy: {gru_val_acc[-1]}%, Training Time: {gru_training_time} seconds, Model Size: {gru_model_size} parameters")

Training LSTM model...
Epoch 1/10, Train Loss: 1.8350824954450917, Val Loss: 1.628255730446768, Val Accuracy: 51.17203785307949%
Epoch 2/10, Train Loss: 1.5772494181982075, Val Loss: 1.537652634712587, Val Accuracy: 53.64656412923124%
Epoch 3/10, Train Loss: 1.5113313931777295, Val Loss: 1.5032436849738562, Val Accuracy: 54.39788768699036%
Epoch 4/10, Train Loss: 1.4729554098841109, Val Loss: 1.4710882837171877, Val Accuracy: 55.33973183666333%
Epoch 5/10, Train Loss: 1.4472309622253177, Val Loss: 1.4569780490758155, Val Accuracy: 55.629771420118075%
Epoch 6/10, Train Loss: 1.4284298277694605, Val Loss: 1.443425266344658, Val Accuracy: 55.890672560103646%
Epoch 7/10, Train Loss: 1.413957880939209, Val Loss: 1.4319413178143525, Val Accuracy: 56.40933685385502%
Epoch 8/10, Train Loss: 1.4023693522723735, Val Loss: 1.4265667987293849, Val Accuracy: 56.54158055883052%
Epoch 9/10, Train Loss: 1.3926018366287716, Val Loss: 1.4163671794186634, Val Accuracy: 56.70027300480112%
Epoch 10/10, Tra

In [7]:
# Prepare the dataset with sequence length 50
sequence_length = 50
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
encoded_text = [char_to_int[ch] for ch in text]

# At least one (window, next-character) pair is needed; fail here with a clear
# message instead of producing an empty dataset that breaks later.
if len(encoded_text) <= sequence_length:
    raise ValueError(
        f"Corpus has {len(encoded_text)} characters; need more than "
        f"sequence_length={sequence_length} to build any training example."
    )

# Build the samples directly as tensors instead of via a Python list of lists.
# Row i of `sequences` is encoded_text[i:i+sequence_length] and targets[i] is
# the character that follows it, encoded_text[i+sequence_length].
# unfold() yields every stride-1 window; the last window has no following
# character, so it is dropped with [:-1].
encoded_tensor = torch.tensor(encoded_text, dtype=torch.long)
sequences = encoded_tensor.unfold(0, sequence_length, 1)[:-1].contiguous()
targets = encoded_tensor[sequence_length:].clone()

# Define the dataset class
class CharDataset(Dataset):
    """Dataset wrapper exposing ``(sequences[i], targets[i])`` as item ``i``.

    The two containers are stored unchanged on the instance.
    """

    def __init__(self, sequences, targets):
        self.targets = targets
        self.sequences = sequences

    def __len__(self):
        # Length is taken from ``sequences``.
        return len(self.sequences)

    def __getitem__(self, index):
        inputs, label = self.sequences[index], self.targets[index]
        return inputs, label

# Split the dataset into training and testing sets (80% / 20% of the samples).
# random_split is called without a `generator`, so the partition depends on
# the global RNG state at the time this cell runs.
# NOTE(review): the windows are built with stride 1, so neighbouring samples
# overlap heavily; a random split therefore puts near-identical windows in
# both subsets — confirm this is acceptable for the reported accuracy.
dataset = CharDataset(sequences, targets)
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders. This cell uses batch size 64, whereas the other loader
# cells in this notebook use 128, so timings are not directly comparable.
batch_size = 64
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [8]:
# Set device: use CUDA when it is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters for this run.
# input_size and output_size both equal the vocabulary size; hidden_size is
# used by the models for both the embedding width and the recurrent state.
# NOTE(review): this run uses whatever train_loader/test_loader are currently
# bound — presumably the sequence-length-50 loaders; verify the cell order.
input_size = len(chars)
hidden_size = 128
output_size = len(chars)
num_epochs = 10
lr = 0.001

# train_model returns (train losses, val losses, val accuracies, elapsed
# seconds, trainable-parameter count). The *_model_size variables therefore
# hold a parameter count, not a size in bytes.
print("Training LSTM model...")
lstm_model = LSTMModel(input_size, hidden_size, output_size)
lstm_train_losses, lstm_val_losses, lstm_val_acc, lstm_training_time, lstm_model_size = train_model(lstm_model, train_loader, test_loader, device, num_epochs, lr)

print("\nTraining GRU model...")
gru_model = GRUModel(input_size, hidden_size, output_size)
gru_train_losses, gru_val_losses, gru_val_acc, gru_training_time, gru_model_size = train_model(gru_model, train_loader, test_loader, device, num_epochs, lr)

# Compare results: report the final-epoch metrics of each model side by side
print("\nComparison of LSTM and GRU models:")
print(f"LSTM - Train Loss: {lstm_train_losses[-1]}, Val Loss: {lstm_val_losses[-1]}, Val Accuracy: {lstm_val_acc[-1]}%, Training Time: {lstm_training_time} seconds, Model Size: {lstm_model_size} parameters")
print(f"GRU - Train Loss: {gru_train_losses[-1]}, Val Loss: {gru_val_losses[-1]}, Val Accuracy: {gru_val_acc[-1]}%, Training Time: {gru_training_time} seconds, Model Size: {gru_model_size} parameters")

Training LSTM model...
Epoch 1/10, Train Loss: 1.7794938683125059, Val Loss: 1.5889111647165444, Val Accuracy: 52.078504857241484%
Epoch 2/10, Train Loss: 1.5394333560255402, Val Loss: 1.5149517819345237, Val Accuracy: 54.289031644917046%
Epoch 3/10, Train Loss: 1.479664434427819, Val Loss: 1.4759218515299273, Val Accuracy: 55.28199794682363%
Epoch 4/10, Train Loss: 1.4466348021021191, Val Loss: 1.4591693799350705, Val Accuracy: 55.650942085184404%
Epoch 5/10, Train Loss: 1.4250733407254097, Val Loss: 1.4415393042721696, Val Accuracy: 56.159753260202%
Epoch 6/10, Train Loss: 1.4086411533525351, Val Loss: 1.430341024885763, Val Accuracy: 56.49955843259261%
Epoch 7/10, Train Loss: 1.3963485210556807, Val Loss: 1.4218892815362019, Val Accuracy: 56.62508013215642%
Epoch 8/10, Train Loss: 1.386787021674003, Val Loss: 1.4167328518625655, Val Accuracy: 56.84429481460893%
Epoch 9/10, Train Loss: 1.379176461958266, Val Loss: 1.4164136524557451, Val Accuracy: 56.85505381742869%
Epoch 10/10, Trai