In [None]:
# Mount Google Drive into the Colab filesystem; the PTB files are read from it below.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
# Work from the Drive root so the relative 'ptb_data/...' paths used later resolve.
os.chdir('/content/drive/MyDrive/')
# Sanity check: show which files are present in the dataset directory.
print(os.listdir('ptb_data'))

['ptb.test.txt', 'ptb.valid.txt', 'ptb.train.txt']


#  1. Load and preprocess the dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from collections import Counter
import re
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

In [None]:
# Paths of the three PTB splits, relative to the working directory set by os.chdir above.
train_path = 'ptb_data/ptb.train.txt'
valid_path = 'ptb_data/ptb.valid.txt'
test_path = 'ptb_data/ptb.test.txt'

In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it into word and symbol tokens.

    Each run of word characters becomes one token and every other
    non-whitespace character becomes its own one-character token. As a
    consequence a marker such as ``<unk>`` is returned as the three tokens
    ``'<'``, ``'unk'``, ``'>'``, and whitespace (including newlines) is dropped.
    """
    lowered = text.lower()
    token_pattern = re.compile(r'\w+|\S')
    return token_pattern.findall(lowered)

def load_data(file_path, encoding='utf-8'):
    """Read a text file and return its contents as a flat list of tokens.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. It is given
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        The list of tokens produced by ``tokenize`` for the whole file.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        text = f.read()
    return tokenize(text)

# Tokenize the three splits in train/valid/test order; the resulting token
# lists feed the vocabulary and the datasets further down.
train_data, valid_data, test_data = (
    load_data(split_path) for split_path in (train_path, valid_path, test_path)
)

# Report how many tokens each split produced.
print(f"Train data size: {len(train_data)} tokens")
print(f"Validation data size: {len(valid_data)} tokens")
print(f"Test data size: {len(test_data)} tokens")


Train data size: 1027251 tokens
Validation data size: 81364 tokens
Test data size: 92844 tokens


In [None]:
class Vocab:
    """Two-way mapping between tokens and integer ids.

    Index 0 is always reserved for ``'<unk>'``. All other tokens receive the
    ids 1, 2, ... in ``Counter.most_common()`` order (most frequent first), so
    every id lies in ``range(vocab_size)``.

    Attributes are documented inline in ``__init__``.
    """

    UNK_TOKEN = '<unk>'

    def __init__(self, tokens):
        """Build the vocabulary from an iterable of tokens."""
        counter = Counter(tokens)
        # token -> id; '<unk>' is pinned to 0 before anything else is added.
        self.token_to_idx = {self.UNK_TOKEN: 0}
        # A literal '<unk>' in the corpus must not be numbered again: doing so
        # would move it off index 0 and push the largest id to vocab_size,
        # which is out of range for an embedding table of that size.
        ranked_tokens = (
            token for token, _ in counter.most_common() if token != self.UNK_TOKEN
        )
        self.token_to_idx.update(
            {token: idx for idx, token in enumerate(ranked_tokens, start=1)}
        )
        # id -> token (inverse of token_to_idx).
        self.idx_to_token = {idx: token for token, idx in self.token_to_idx.items()}
        # Number of distinct ids, including the '<unk>' slot.
        self.vocab_size = len(self.token_to_idx)

    def encode(self, tokens):
        """Map tokens to ids; tokens not in the vocabulary map to the '<unk>' id."""
        unk_idx = self.token_to_idx[self.UNK_TOKEN]
        return [self.token_to_idx.get(token, unk_idx) for token in tokens]

    def decode(self, indices):
        """Map ids back to tokens; unknown ids decode to '<unk>'."""
        return [self.idx_to_token.get(idx, self.UNK_TOKEN) for idx in indices]

# Build the vocabulary from the training tokens only; anything that occurs
# solely in the validation/test splits is encoded as '<unk>' (index 0).
vocab = Vocab(train_data)

# Convert each split from tokens to integer ids with the training vocabulary.
train_ids = vocab.encode(train_data)
valid_ids = vocab.encode(valid_data)
test_ids = vocab.encode(test_data)

print(f"Vocabulary size: {vocab.vocab_size}")


Vocabulary size: 9655


# 2. Create Dataset and Dataloader

In [None]:
class PTBDataset(Dataset):
    """Sliding-window next-token dataset over a flat sequence of token ids.

    Item ``i`` is the pair ``(x, y)``: ``x`` holds the ``seq_length`` ids that
    start at position ``i * stride`` and ``y`` holds the same window shifted
    one position to the right (the next-token targets). Both are 1-D
    ``torch.long`` tensors of length ``seq_length``.

    Args:
        data: Sequence of integer token ids (e.g. a list or a 1-D tensor).
        seq_length: Number of tokens per input/target window (>= 1).
        stride: Distance between the starts of consecutive windows (>= 1).
            The default of 1 yields every overlapping window;
            ``stride=seq_length`` yields non-overlapping windows.

    Raises:
        ValueError: If ``seq_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, data, seq_length, stride=1):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        # Convert once here: slicing a tensor per sample is much cheaper than
        # building two new tensors from Python list slices on every access.
        self.data = torch.as_tensor(data, dtype=torch.long)
        self.seq_length = seq_length
        self.stride = stride

    def __len__(self):
        # A window needs seq_length inputs plus one extra token for the last
        # target, so the final valid start position is len(data) - seq_length - 1.
        last_start = self.data.numel() - self.seq_length - 1
        if last_start < 0:
            return 0  # sequence too short for even one full window
        return last_start // self.stride + 1

    def __getitem__(self, idx):
        num_items = len(self)
        if idx < 0:
            idx += num_items
        # Raising IndexError (instead of returning truncated windows) also lets
        # plain ``for x, y in dataset`` iteration terminate.
        if not 0 <= idx < num_items:
            raise IndexError(f"index {idx} out of range for dataset of size {num_items}")
        start = idx * self.stride
        end = start + self.seq_length
        return self.data[start:end], self.data[start + 1:end + 1]

# Hyperparameters
seq_length = 20   # tokens per input/target window
batch_size = 128  # windows per batch

# Create datasets: one sliding-window dataset per split.
train_dataset = PTBDataset(train_ids, seq_length)
valid_dataset = PTBDataset(valid_ids, seq_length)
test_dataset = PTBDataset(test_ids, seq_length)

# Create dataloaders: only the training loader shuffles and uses worker
# processes; validation and test are read in order in the main process.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Check batch shapes: pull a single training batch and stop.
for x_batch, y_batch in train_loader:
    print(f"Input batch shape: {x_batch.shape}")
    print(f"Target batch shape: {y_batch.shape}")
    break


Input batch shape: torch.Size([128, 20])
Target batch shape: torch.Size([128, 20])


# 3. Define the LSTM and GRU Models

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# Models and batches are moved to this device by the cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
class RNNModel(nn.Module):
    """Language model: embedding -> stacked LSTM/GRU -> linear decoder.

    Dropout is applied to the embedded inputs and to the recurrent outputs,
    and between recurrent layers when ``num_layers > 1``.

    Args:
        rnn_type: ``'LSTM'`` or ``'GRU'``.
        vocab_size: Number of token ids (embedding rows and output logits).
        embed_size: Embedding dimension.
        hidden_size: Hidden units per recurrent layer.
        num_layers: Number of stacked recurrent layers.
        dropout: Dropout probability.

    Raises:
        ValueError: If ``rnn_type`` is not one of the supported types.
    """

    _RNN_CLASSES = {'LSTM': nn.LSTM, 'GRU': nn.GRU}

    def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        super().__init__()
        # Fail fast: an unknown type would otherwise leave self.rnn undefined
        # and only surface later as an AttributeError in forward().
        if rnn_type not in self._RNN_CLASSES:
            raise ValueError(
                f"rnn_type must be one of {sorted(self._RNN_CLASSES)}, got {rnn_type!r}"
            )
        self.rnn_type = rnn_type
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(dropout)

        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value for a single layer has no effect beyond a warning.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = self._RNN_CLASSES[rnn_type](
            embed_size, hidden_size, num_layers, dropout=rnn_dropout, batch_first=True
        )

        # Fully connected output layer producing one logit per vocabulary id.
        self.fc = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Uniformly initialise embedding/decoder weights and zero the decoder bias."""
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        Args:
            x: Long tensor of token ids, batch-first (``batch_first=True``).
            hidden: Initial recurrent state as returned by ``init_hidden``;
                ``None`` lets the recurrent layer start from zeros.

        Returns:
            ``logits`` with one vocabulary-sized vector per input position,
            and the final recurrent state.
        """
        x = self.dropout(self.embedding(x))
        output, hidden = self.rnn(x, hidden)
        output = self.fc(self.dropout(output))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zero initial state on the same device/dtype as the model.

        The result is a ``(h, c)`` tuple for LSTM and a single tensor for GRU,
        each of shape ``(num_layers, batch_size, hidden_size)``.
        """
        reference = next(self.parameters())
        shape = (self.rnn.num_layers, batch_size, self.rnn.hidden_size)

        def zeros():
            # Follow the model's own parameters instead of a global device so
            # the state always matches wherever the model currently lives.
            return torch.zeros(shape, dtype=reference.dtype, device=reference.device)

        if self.rnn_type == 'LSTM':
            return (zeros(), zeros())
        return zeros()

# Hyperparameters shared by both models
vocab_size = vocab.vocab_size  # Size of the vocabulary
embed_size = 200               # Embedding size
hidden_size = 200              # Number of hidden units
num_layers = 2                 # Number of layers in LSTM/GRU
dropout = 0.5                  # Dropout probability

# Instantiate one LSTM and one GRU model with identical sizes and move them to `device`.
lstm_model = RNNModel('LSTM', vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)
gru_model = RNNModel('GRU', vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)


In [None]:
# Loss function and optimizers.
# Each optimizer captures the parameters of the model object that exists at
# this point; if a model variable is later re-bound to a new RNNModel, its
# optimizer has to be re-created as well.
criterion = nn.CrossEntropyLoss()
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=1e-3, weight_decay=1e-4)
gru_optimizer = optim.Adam(gru_model.parameters(), lr=1e-3, weight_decay=1e-4)

# 4.  Training and Validation Functions

In [None]:
def run_epoch(model, data_loader, criterion, hidden, optimizer=None, mode='train', max_grad_norm=None):
    """Run one pass over ``data_loader`` and return the resulting perplexity.

    Every batch starts from a fresh zero hidden state obtained from
    ``model.init_hidden`` for the batch's actual size, so batches are treated
    as independent sequences.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and a
            ``forward(x, hidden)`` that returns ``(logits, hidden)``.
        data_loader: Iterable of ``(x_batch, y_batch)`` token-id tensors.
        criterion: Loss taking ``(logits_2d, targets_1d)``.
        hidden: Unused; kept for backward compatibility with existing callers
            (the state is re-initialised for every batch).
        optimizer: Optimizer to step; required when ``mode == 'train'``.
        mode: ``'train'`` to update the model; any other value evaluates.
        max_grad_norm: If given, gradients are clipped to this total norm
            before each optimizer step. ``None`` disables clipping.

    Returns:
        ``exp`` of the mean of the per-batch losses. Every batch carries equal
        weight, so a smaller final batch counts as much as a full one.

    Raises:
        ValueError: If training is requested without an optimizer, or if
            ``data_loader`` yields no batches.
    """
    is_training = mode == 'train'
    if is_training and optimizer is None:
        raise ValueError("optimizer is required when mode='train'")

    model.train(is_training)
    # Follow the model's own placement rather than relying on a global device.
    model_device = next(model.parameters()).device

    total_loss = 0.0
    num_batches = 0

    # Autograd is only needed when training; disabling it for evaluation
    # avoids building graphs that are never used.
    with torch.set_grad_enabled(is_training):
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(model_device)
            y_batch = y_batch.to(model_device)

            # The last batch can be smaller, so size the state per batch.
            batch_hidden = model.init_hidden(x_batch.size(0))

            if is_training:
                optimizer.zero_grad()

            output, _ = model(x_batch, batch_hidden)

            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) and targets to
            # (batch*seq,) as expected by the criterion.
            loss = criterion(output.reshape(-1, output.size(-1)), y_batch.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            if is_training:
                loss.backward()
                if max_grad_norm is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
                optimizer.step()

    if num_batches == 0:
        raise ValueError("data_loader yielded no batches")

    avg_loss = total_loss / num_batches
    return math.exp(avg_loss)

def train_model(model, optimizer, train_loader, valid_loader, criterion, num_epochs=20, learning_rate=1e-3, model_name="LSTM", adjust_lr=False, patience=3):
    """Train ``model`` with early stopping and plot the perplexity curves.

    After every epoch the validation perplexity is compared with the best
    value so far; on improvement the weights are written to
    ``"{model_name}_dropout_{p}.pt"`` (``p`` is ``model.dropout.p``). That
    file therefore holds the best-validation weights. It is not overwritten
    with the last epoch's weights when training ends; the in-memory ``model``
    keeps its last-epoch weights.

    Args:
        model: Model to train; must expose ``dropout.p``.
        optimizer: Optimizer built over ``model``'s parameters.
        train_loader: Batches used for the training pass.
        valid_loader: Batches used for the validation pass.
        criterion: Loss passed through to ``run_epoch``.
        num_epochs: Maximum number of epochs.
        learning_rate: Base learning rate, used only when ``adjust_lr`` is set.
        model_name: Prefix of the checkpoint file name and plot title.
        adjust_lr: If true, the learning rate is set to ``learning_rate / 2``
            once more than 10 epochs have run (a single halving, not a decay).
        patience: Number of consecutive epochs without validation improvement
            after which training stops.
    """
    train_perplexities = []
    valid_perplexities = []
    model_dropout = model.dropout.p
    checkpoint_path = f"{model_name}_dropout_{model_dropout}.pt"

    # Early stopping state
    best_val_perplexity = float('inf')
    epochs_without_improvement = 0
    checkpoint_saved = False

    for epoch in range(1, num_epochs + 1):
        print(f"Epoch {epoch}/{num_epochs}")

        # run_epoch creates its own per-batch hidden state and ignores the
        # `hidden` argument, so no initial state is built here.
        train_perplexity = run_epoch(model, train_loader, criterion, None, optimizer, mode='train')
        train_perplexities.append(train_perplexity)

        valid_perplexity = run_epoch(model, valid_loader, criterion, None, mode='eval')
        valid_perplexities.append(valid_perplexity)

        print(f"Train Perplexity: {train_perplexity}, Validation Perplexity: {valid_perplexity}")

        if valid_perplexity < best_val_perplexity:
            best_val_perplexity = valid_perplexity
            epochs_without_improvement = 0
            # Checkpoint the best model seen so far.
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered. No improvement in validation perplexity for {patience} epochs.")
            break

        if epoch > 10 and adjust_lr:
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate / 2

    # Only fall back to saving the final weights when no epoch ever produced a
    # checkpoint (e.g. num_epochs == 0); otherwise the best one must survive.
    if not checkpoint_saved:
        torch.save(model.state_dict(), checkpoint_path)

    plot_perplexity(train_perplexities, valid_perplexities, f"{model_name} Dropout {model_dropout}")

def plot_perplexity(train_perplexities, valid_perplexities, title):
    """Draw training and validation perplexity against the epoch number.

    Args:
        train_perplexities: One training perplexity per epoch.
        valid_perplexities: One validation perplexity per epoch.
        title: Title shown above the figure.
    """
    epoch_axis = range(1, len(train_perplexities) + 1)
    fig, ax = plt.subplots(figsize=(10, 6))

    curves = (
        (train_perplexities, 'Train Perplexity'),
        (valid_perplexities, 'Validation Perplexity'),
    )
    for values, curve_label in curves:
        ax.plot(epoch_axis, values, label=curve_label)

    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Perplexity')
    ax.legend()
    ax.grid(True)
    plt.show()


# 5. Training the Models

In [None]:
# Loss function and optimizers (same definitions as in the earlier cell in section 3).
# The optimizers are bound to the lstm_model / gru_model objects that exist
# when this cell runs.
criterion = nn.CrossEntropyLoss()
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=1e-3, weight_decay=1e-4)
gru_optimizer = optim.Adam(gru_model.parameters(), lr=1e-3, weight_decay=1e-4)

In [None]:
# Train GRU with dropout
gru_model = RNNModel('GRU', vocab_size, embed_size, hidden_size, num_layers, dropout=0.5).to(device)
# The model was just re-created, so the optimizer must be rebuilt over the new
# parameters; the earlier gru_optimizer still points at the previous model and
# would leave this one untrained.
gru_optimizer = optim.Adam(gru_model.parameters(), lr=1e-3, weight_decay=1e-4)
train_model(gru_model, gru_optimizer, train_loader, valid_loader, criterion, num_epochs=20, learning_rate=1e-3, model_name="GRU", adjust_lr=False)


Epoch 1/20


KeyboardInterrupt: 

The title of the plot above was incorrectly labeled as 'Dropout False', but it should have been 'Dropout True', as I had set a dropout rate of 0.5. Since retraining the model would require significant time and GPU resources, I have corrected the plot below with the appropriate labeling.

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training perplexities for the GRU run with dropout 0.5, hard-coded
# here so the figure can be redrawn with a corrected title without retraining.
# NOTE(review): presumably transcribed from the earlier run's log - confirm.
train_perplexities = [
    79.2038407732293, 71.23041632081303, 67.95972482490266, 65.89077405722423,
    64.39021007516926, 63.282079283223844, 62.349822438475165, 61.58643578918576,
    60.98221845850603, 60.42198838709921, 59.973448579326416, 59.59782594681733,
    59.24937170954137, 58.95355961403196, 58.67382698031848, 58.42978451852492,
    58.20298292592998, 58.01862000444544, 57.83689348959762, 57.65528775908054
]

# Matching per-epoch validation perplexities (20 epochs).
valid_perplexities = [
    83.51898453606896, 80.6781195059235, 79.28002449351014, 78.71301607400568,
    78.23119245150805, 77.63007516100532, 76.96321206774431, 76.79797927372502,
    77.10666476949167, 76.7908415302071, 76.5967117595488, 76.45965577253581,
    76.39802953434297, 76.10237647133897, 76.07677330732102, 75.76233214819982,
    76.0703291660445, 75.97126476358328, 75.93605645844217, 75.92119399268563
]

# Redraw both curves against epochs 1..20 with the corrected title.
epochs = range(1, 21)
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_perplexities, label='Train Perplexity')
plt.plot(epochs, valid_perplexities, label='Validation Perplexity')
plt.title('GRU with Dropout (0.5) Perplexity over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Perplexity')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Train LSTM with dropout
# Reuses the lstm_model (built with dropout = 0.5) and the lstm_optimizer created in the earlier cells.
train_model(lstm_model, lstm_optimizer, train_loader, valid_loader, criterion, num_epochs=20, learning_rate=1e-3, model_name="LSTM", adjust_lr=False)


Epoch 1/20
Train Perplexity: 168.1360345293799, Validation Perplexity: 122.49096319408415
Epoch 2/20
Train Perplexity: 124.57422855083269, Validation Perplexity: 115.4206457530447
Epoch 3/20
Train Perplexity: 119.28772611267013, Validation Perplexity: 111.81512979427684
Epoch 4/20
Train Perplexity: 116.62138439659785, Validation Perplexity: 110.17639853948272
Epoch 5/20
Train Perplexity: 115.06268699787479, Validation Perplexity: 109.38395062383222
Epoch 6/20
Train Perplexity: 113.83612405659997, Validation Perplexity: 107.97775254360951
Epoch 7/20
Train Perplexity: 112.88567135094235, Validation Perplexity: 107.2866375434657
Epoch 8/20
Train Perplexity: 112.26670621211291, Validation Perplexity: 107.04651771116298
Epoch 9/20
Train Perplexity: 111.92381935540799, Validation Perplexity: 107.28912952641741
Epoch 10/20
Train Perplexity: 111.60378786716478, Validation Perplexity: 107.23322071335828
Epoch 11/20


KeyboardInterrupt: 

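As with the GRU plot, the title of the LSTM plot above was mislabeled as 'Dropout False' even though the dropout rate was 0.5. The plot is redrawn below with the correct title instead of retraining the model.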

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training perplexities for the LSTM run with dropout 0.5, hard-coded
# here so the figure can be redrawn with a corrected title without retraining.
# NOTE(review): presumably transcribed from the earlier run's log - confirm.
train_perplexities = [
    150.70332416177132, 88.24628654241678, 77.09482233234895, 71.67234172535304,
    68.39336558406458, 66.18962558774173, 64.5742895635338, 63.32189312216786,
    62.32098061717168, 61.53321605723979, 60.82929555249856, 60.243227527320045,
    59.713504605157965, 59.27169866239956, 58.85302214781619, 58.48169077611899,
    58.1492392204031, 57.84321812106282, 57.56612621685514, 57.3090238691717
]

# Matching per-epoch validation perplexities (20 epochs).
valid_perplexities = [
    103.19198727059738, 94.02699475507802, 91.06219613293885, 90.4301103040572,
    90.14778112524569, 90.05302524516907, 89.71052606461775, 89.70515593230863,
    90.23652013207837, 90.22302758661404, 90.27797104561058, 90.12297744140753,
    90.78439104988898, 90.53231438997814, 90.73777568127538, 90.60409196894616,
    90.82609012563017, 90.79560485511654, 90.8163036636236, 91.26676175400254
]

# Redraw both curves against epochs 1..20 with the corrected title.
epochs_new = range(1, 21)
plt.figure(figsize=(10, 6))
plt.plot(epochs_new, train_perplexities, label='Train Perplexity')
plt.plot(epochs_new, valid_perplexities, label='Validation Perplexity')
plt.title('LSTM with Dropout (0.5) Perplexity over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Perplexity')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Train LSTM without dropout
# A separate model with dropout=0 and its own optimizer are required here;
# reusing lstm_model would simply continue training the dropout-0.5 network.
lstm_model_without_dropout = RNNModel('LSTM', vocab_size, embed_size, hidden_size, num_layers, dropout=0).to(device)
lstm_optimizer_without_dropout = optim.Adam(lstm_model_without_dropout.parameters(), lr=1e-3, weight_decay=1e-4)
train_model(lstm_model_without_dropout, lstm_optimizer_without_dropout, train_loader, valid_loader, criterion, num_epochs=20, learning_rate=1e-3, model_name="LSTM")


In [None]:
# Train GRU without dropout
# A separate model with dropout=0 and its own optimizer are required here;
# reusing gru_model would simply continue training the dropout-0.5 network.
gru_model_without_dropout = RNNModel('GRU', vocab_size, embed_size, hidden_size, num_layers, dropout=0).to(device)
gru_optimizer_without_dropout = optim.Adam(gru_model_without_dropout.parameters(), lr=1e-3, weight_decay=1e-4)
train_model(gru_model_without_dropout, gru_optimizer_without_dropout, train_loader, valid_loader, criterion, num_epochs=20, learning_rate=1e-3, model_name="GRU")

# 6. Test with Saved Weights

### LSTM without dropout

In [None]:
# Recreate the same model architecture
model = RNNModel(rnn_type='LSTM', vocab_size=vocab_size, embed_size=200, hidden_size=200, num_layers=2, dropout=0)

# Load the saved model weights
model.load_state_dict(torch.load("LSTM_dropout_0.pt", map_location=torch.device('cpu')))

# Move the model to GPU if available
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize hidden state for testing
hidden = model.init_hidden(batch_size)

# Run on the validation or test set
validation_perplexity = run_epoch(model, valid_loader, criterion, hidden, mode='eval')
print(f"LSTM_dropout_0 Validation Perplexity: {validation_perplexity}")

  model.load_state_dict(torch.load("LSTM_dropout_0.pt", map_location=torch.device('cpu')))


LSTM_dropout_0 Validation Perplexity: 89.20511978425607


In [None]:
def evaluate_test_perplexity(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader``, print and return the perplexity.

    The printed label is hard-coded for the LSTM model without dropout.
    Relies on the notebook-level ``batch_size`` and ``run_epoch``.
    """
    model.eval()  # Set the model to evaluation mode
    hidden = model.init_hidden(batch_size)  # Initialize hidden state
    test_perplexity = run_epoch(model, test_loader, criterion, hidden, mode='eval')
    print(f"LSTM dropout 0 Test Perplexity: {test_perplexity}")
    return test_perplexity

# Load the checkpoint written by training under this name
model = RNNModel('LSTM', vocab_size, embed_size, hidden_size, num_layers, dropout=0).to(device)
model.load_state_dict(torch.load("LSTM_dropout_0.pt", map_location=torch.device('cpu')))  # Adjust model name if needed
# Move the model to GPU if available
model = model.to(device)
# Evaluate on the test set
test_perplexity = evaluate_test_perplexity(model, test_loader, criterion)

  model.load_state_dict(torch.load("LSTM_dropout_0.pt", map_location=torch.device('cpu')))  # Adjust model name if needed


LSTM dropout 0 Test Perplexity: 78.05171712215294


### LSTM with dropout

In [None]:
# Recreate the same model architecture
model = RNNModel(rnn_type='LSTM', vocab_size=vocab_size, embed_size=200, hidden_size=200, num_layers=2, dropout=0.5)

# Load the saved model weights
model.load_state_dict(torch.load("LSTM_dropout_dot5.pt", map_location=torch.device('cpu')))

# Move the model to GPU if available
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize hidden state for testing
hidden = model.init_hidden(batch_size)

# Run on the validation or test set
validation_perplexity = run_epoch(model, valid_loader, criterion, hidden, mode='eval')
print(f"LSTM_dropout_0.5 Validation Perplexity: {validation_perplexity}")



  model.load_state_dict(torch.load("LSTM_dropout_dot5.pt", map_location=torch.device('cpu')))


LSTM_dropout_0.5 Validation Perplexity: 91.26665208316972


In [None]:
# Redefines evaluate_test_perplexity from the previous section; only the printed label differs.
def evaluate_test_perplexity(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader``, print and return the perplexity.

    The printed label is hard-coded for the LSTM model with dropout 0.5.
    Relies on the notebook-level ``batch_size`` and ``run_epoch``.
    """
    model.eval()  # Set the model to evaluation mode
    hidden = model.init_hidden(batch_size)  # Initialize hidden state
    test_perplexity = run_epoch(model, test_loader, criterion, hidden, mode='eval')
    print(f"LSTM dropout 0.5 Test Perplexity: {test_perplexity}")
    return test_perplexity

# Load the checkpoint for the dropout-0.5 LSTM.
# NOTE(review): train_model would name this file "LSTM_dropout_0.5.pt"; the
# "dot5" name presumably comes from a manual rename - confirm.
model = RNNModel('LSTM', vocab_size, embed_size, hidden_size, num_layers, dropout=0.5).to(device)
model.load_state_dict(torch.load("LSTM_dropout_dot5.pt", map_location=torch.device('cpu')))
# Move the model to GPU if available
model = model.to(device)
# Evaluate on the test set
test_perplexity = evaluate_test_perplexity(model, test_loader, criterion)


  model.load_state_dict(torch.load("LSTM_dropout_dot5.pt", map_location=torch.device('cpu')))  # Adjust model name if needed


Test Perplexity: 70.65373100074312


### GRU without dropout

In [None]:
# Recreate the same model architecture
model = RNNModel(rnn_type='GRU', vocab_size=vocab_size, embed_size=200, hidden_size=200, num_layers=2, dropout=0.0)

# Load the saved model weights
model.load_state_dict(torch.load("GRU_dropout_0.pt"))

# Move the model to GPU if available
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize hidden state for testing
hidden = model.init_hidden(batch_size)

# Run on the validation or test set
validation_perplexity = run_epoch(model, valid_loader, criterion, hidden, mode='eval')
print(f"GRU_dropout_0 Validation Perplexity: {validation_perplexity}")

  model.load_state_dict(torch.load("GRU_dropout_0.pt"))


GRU_dropout_0 Validation Perplexity: 84.67818147256901


In [None]:
# Redefines evaluate_test_perplexity from the previous sections; only the printed label differs.
def evaluate_test_perplexity(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader``, print and return the perplexity.

    The printed label is hard-coded for the GRU model without dropout.
    Relies on the notebook-level ``batch_size`` and ``run_epoch``.
    """
    model.eval()  # Set the model to evaluation mode
    hidden = model.init_hidden(batch_size)  # Initialize hidden state
    test_perplexity = run_epoch(model, test_loader, criterion, hidden, mode='eval')
    print(f"GRU dropout 0 Test Perplexity: {test_perplexity}")
    return test_perplexity

# Load the checkpoint written by training under this name
model = RNNModel('GRU', vocab_size, embed_size, hidden_size, num_layers, dropout=0).to(device)
model.load_state_dict(torch.load("GRU_dropout_0.pt", map_location=torch.device('cpu')))
# Move the model to GPU if available
model = model.to(device)
# Evaluate on the test set
test_perplexity = evaluate_test_perplexity(model, test_loader, criterion)

  model.load_state_dict(torch.load("GRU_dropout_0.pt", map_location=torch.device('cpu')))


GRU dropout 0 Test Perplexity: 73.53074413900691


### GRU with dropout

In [None]:
# Recreate the same model architecture
model = RNNModel(rnn_type='GRU', vocab_size=vocab_size, embed_size=200, hidden_size=200, num_layers=2, dropout=0.5)

# Load the saved model weights
model.load_state_dict(torch.load("GRU_dropout_dot5.pt"))

# Move the model to GPU if available
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize hidden state for testing
hidden = model.init_hidden(batch_size)

# Run on the validation or test set
validation_perplexity = run_epoch(model, valid_loader, criterion, hidden, mode='eval')
print(f"GRU_dropout_0.5 Validation Perplexity: {validation_perplexity}")

  model.load_state_dict(torch.load("GRU_dropout_dot5.pt"))


GRU_dropout_0.5 Validation Perplexity: 75.92119399268563


In [None]:
# Redefines evaluate_test_perplexity from the previous sections; only the printed label differs.
def evaluate_test_perplexity(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader``, print and return the perplexity.

    The printed label is hard-coded for the GRU model with dropout 0.5.
    Relies on the notebook-level ``batch_size`` and ``run_epoch``.
    """
    model.eval()  # Set the model to evaluation mode
    hidden = model.init_hidden(batch_size)  # Initialize hidden state
    test_perplexity = run_epoch(model, test_loader, criterion, hidden, mode='eval')
    print(f"GRU dropout 0.5 Test Perplexity: {test_perplexity}")
    return test_perplexity

# Load the checkpoint for the dropout-0.5 GRU.
# NOTE(review): train_model would name this file "GRU_dropout_0.5.pt"; the
# "dot5" name presumably comes from a manual rename - confirm.
model = RNNModel('GRU', vocab_size, embed_size, hidden_size, num_layers, dropout=0.5).to(device)
model.load_state_dict(torch.load("GRU_dropout_dot5.pt", map_location=torch.device('cpu')))
# Move the model to GPU if available
model = model.to(device)
# Evaluate on the test set
test_perplexity = evaluate_test_perplexity(model, test_loader, criterion)



  model.load_state_dict(torch.load("GRU_dropout_dot5.pt", map_location=torch.device('cpu')))  # Adjust model name if needed


GRU dropout 0.5 Test Perplexity: 63.965365740145465
