<a href="https://colab.research.google.com/github/sesmael/Real-Time-ML-/blob/main/Homework5_problem_2_partC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# ---------------------------
# Download and Process the Tiny Shakespeare Dataset
# ---------------------------
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of training on the body of an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text  # The entire text data

def process_data(sequence_length, source_text=None):
    """Encode a corpus as integers and cut it into next-character samples.

    Every window of ``sequence_length`` consecutive characters becomes one
    input row, and the character immediately following the window is its
    target. Windows advance one character at a time.

    Args:
        sequence_length: Number of characters per input window (must be >= 1).
        source_text: Corpus to process. When ``None`` (the default) the
            module-level ``text`` is used, which keeps existing callers working.

    Returns:
        A tuple ``(sequences, targets, char_to_int, int_to_char)`` where
        ``sequences`` is a ``torch.long`` tensor of shape
        ``(num_samples, sequence_length)``, ``targets`` is a ``torch.long``
        tensor of shape ``(num_samples,)``, and the two dicts map characters to
        ids and back. ``num_samples`` is ``len(corpus) - sequence_length``, or
        zero when the corpus is not longer than one window.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    corpus = text if source_text is None else source_text

    # Create a character mapping to integers (sorted so ids are deterministic)
    chars = sorted(set(corpus))
    char_to_int = {ch: i for i, ch in enumerate(chars)}
    int_to_char = dict(enumerate(chars))

    # Encode the text into integers
    encoded = torch.tensor([char_to_int[ch] for ch in corpus], dtype=torch.long)

    if encoded.numel() <= sequence_length:
        # Not enough characters for a window plus its target.
        sequences = torch.empty((0, sequence_length), dtype=torch.long)
        targets = torch.empty((0,), dtype=torch.long)
        return sequences, targets, char_to_int, int_to_char

    # unfold() yields every stride-1 window without a Python-level loop; the
    # final window has no following character, so it is dropped. Materialise
    # both results so they do not alias the temporary ``encoded`` buffer.
    sequences = encoded.unfold(0, sequence_length, 1)[:-1].contiguous()
    targets = encoded[sequence_length:].clone()
    return sequences, targets, char_to_int, int_to_char

# ---------------------------
# Define the Dataset Class
# ---------------------------
class CharDataset(Dataset):
    """Dataset that pairs each stored sequence with the target at the same index."""

    def __init__(self, sequences, targets):
        # Both containers are kept by reference; nothing is copied or validated.
        self.targets = targets
        self.sequences = sequences

    def __len__(self):
        """Return how many sequences the dataset holds."""
        sample_count = len(self.sequences)
        return sample_count

    def __getitem__(self, index):
        """Return the ``(sequence, target)`` pair located at ``index``."""
        window = self.sequences[index]
        label = self.targets[index]
        return window, label

In [None]:
# ---------------------------
# Define the Transformer Model for Next Character Prediction
# ---------------------------
class TransformerCharModel(nn.Module):
    """Transformer encoder that scores the next character for a window of character ids.

    Character ids are embedded, combined with learned positional embeddings,
    passed through a stack of ``nn.TransformerEncoderLayer`` blocks, and the
    representation at the final position is projected to vocabulary logits.
    """

    def __init__(self, vocab_size, seq_len, embed_dim=128, num_layers=2, num_heads=2, hidden_dim=256, dropout=0.1):
        """
        vocab_size: Number of unique characters
        seq_len: Maximum input sequence length (size of the positional table)
        embed_dim: Dimension of character embeddings
        num_layers: Number of transformer encoder layers
        num_heads: Number of attention heads
        hidden_dim: Hidden dimension of the feed-forward network
        dropout: Dropout probability
        """
        super().__init__()
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Learned positional embeddings, one vector per position, zero-initialised
        self.pos_embedding = nn.Parameter(torch.zeros(1, seq_len, embed_dim))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return next-character logits of shape ``(batch_size, vocab_size)``.

        ``x`` holds character ids with shape ``(batch_size, steps)`` where
        ``1 <= steps <= seq_len``. Inputs shorter than ``seq_len`` are accepted.

        Raises:
            ValueError: If the number of steps is zero or exceeds ``seq_len``.
        """
        steps = x.size(-1)
        if not 0 < steps <= self.seq_len:
            raise ValueError(
                f"expected between 1 and {self.seq_len} time steps, got {steps}"
            )
        x = self.embedding(x)  # (batch_size, steps, embed_dim)
        # Use the last `steps` positional vectors: the prediction is read from the
        # final position, so right-aligning keeps that position paired with the
        # same positional embedding it has for a full-length window.
        x = x + self.pos_embedding[:, -steps:, :]
        x = self.transformer_encoder(x)  # (batch_size, steps, embed_dim)
        # Use the output of the last time step for prediction
        out = self.fc_out(x[:, -1, :])
        return out

# ---------------------------
# Define an RNN Baseline (LSTM) for Comparison
# ---------------------------
class LSTMCharModel(nn.Module):
    """LSTM baseline: embed character ids, run an LSTM, project the last step to logits."""

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=1, dropout=0.0):
        """
        vocab_size: Number of unique characters
        embed_dim: Dimension of character embeddings
        hidden_dim: Size of the LSTM hidden state
        num_layers: Number of stacked LSTM layers
        dropout: Dropout probability handed to nn.LSTM
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits computed from the LSTM output at the final time step."""
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        last_step = hidden_states[:, -1, :]
        return self.fc_out(last_step)

# ---------------------------
# Training and Evaluation Functions
# ---------------------------
def train_model(model, dataloader, criterion, optimizer, device):
    """Run one training pass over ``dataloader`` and return the mean per-batch loss.

    The model is switched to training mode, and for every batch the gradients
    are reset, the loss is back-propagated and one optimizer step is taken.

    Args:
        model: Module being trained; called as ``model(batch_X)``.
        dataloader: Iterable of ``(batch_X, batch_y)`` pairs. It does not need
            to define ``__len__``; batches are counted as they are consumed.
        criterion: Callable ``criterion(outputs, batch_y)`` returning the loss.
        optimizer: Optimizer updating the model's parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    for batch_X, batch_y in dataloader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        # An average over zero batches is undefined; say so explicitly rather
        # than surfacing a bare ZeroDivisionError.
        raise ValueError("dataloader yielded no batches; cannot compute an average loss")
    return running_loss / num_batches

def evaluate_model(model, dataloader, device):
    """Return the fraction of samples whose arg-max prediction equals the target.

    The model is put in evaluation mode and the whole pass runs with gradient
    tracking disabled.
    """
    model.eval()
    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            # The highest-scoring class along dim 1 is the predicted class
            predicted = model(inputs).argmax(dim=1)
            num_correct += predicted.eq(labels).sum().item()
            num_seen += labels.size(0)
    return num_correct / num_seen

# ---------------------------
# Main Experiment Loop (Only for Sequence Length = 50)
# ---------------------------
def _run_experiment(model, train_loader, val_loader, device, num_epochs, learning_rate):
    """Train ``model`` with Adam and cross-entropy, then return its summary metrics.

    Prints one progress line per epoch. The returned dict holds the last
    epoch's training loss, the validation accuracy measured after training,
    the wall-clock time of the epoch loop and the model's parameter count.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # NOTE: the timed region covers the per-epoch validation pass as well as
    # the training pass, so "Training_Time_sec" includes evaluation time.
    start_time = time.time()
    for epoch in range(num_epochs):
        train_loss = train_model(model, train_loader, criterion, optimizer, device)
        val_acc = evaluate_model(model, val_loader, device)
        print(f"Epoch {epoch+1}/{num_epochs}: Train Loss={train_loss:.4f}, Val Acc={val_acc:.4f}")
    elapsed_time = time.time() - start_time
    final_val_acc = evaluate_model(model, val_loader, device)

    return {
        "Final_Train_Loss": train_loss,
        "Final_Val_Accuracy": final_val_acc,
        "Training_Time_sec": elapsed_time,
        "Model_Size": sum(p.numel() for p in model.parameters())
    }


def main(seq_len=50, batch_size=64, num_epochs=10, learning_rate=1e-3,
         layer_options=(1, 2, 4), head_options=(2, 4)):
    """Compare transformer configurations against an LSTM baseline on next-character prediction.

    Builds the dataset for ``seq_len``, splits it 80/20 into train and
    validation sets, trains one transformer per ``(num_layers, num_heads)``
    combination plus a single-layer LSTM, and prints a summary of loss,
    accuracy, elapsed time and parameter count for every model.

    Args:
        seq_len: Characters per input window (defaults to 50).
        batch_size: Batch size for both data loaders.
        num_epochs: Epochs each model is trained for.
        learning_rate: Adam learning rate used for every model.
        layer_options: Transformer encoder depths to try.
        head_options: Attention head counts to try.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    print(f"\n=== Sequence Length: {seq_len} ===")

    # Process data for the requested sequence length
    sequences, targets, char_to_int, int_to_char = process_data(seq_len)
    dataset = CharDataset(sequences, targets)

    # Simple train-validation split (80/20)
    # NOTE(review): process_data produces windows that advance one character
    # at a time, so a random split places heavily overlapping windows on both
    # sides. Validation accuracy is therefore likely optimistic compared with
    # a contiguous hold-out; confirm this is the intended protocol.
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    vocab_size = len(char_to_int)

    # Results keyed by configuration name, in the order the models were trained
    experiment_results = {}

    # Loop over transformer hyperparameter combinations
    for num_layers in layer_options:
        for num_heads in head_options:
            config_name = f"Transformer_layers{num_layers}_heads{num_heads}"
            print(f"\n--- {config_name} ---")
            model = TransformerCharModel(
                vocab_size,
                seq_len=seq_len,
                embed_dim=128,
                num_layers=num_layers,
                num_heads=num_heads,
                hidden_dim=256,
                dropout=0.1
            ).to(device)
            experiment_results[config_name] = _run_experiment(
                model, train_loader, val_loader, device, num_epochs, learning_rate
            )

    # Train and evaluate the RNN baseline (for comparison)
    print("\n--- RNN Baseline (LSTM) ---")
    rnn_model = LSTMCharModel(
        vocab_size,
        embed_dim=128,
        hidden_dim=256,
        num_layers=1,
        dropout=0.0
    ).to(device)
    experiment_results["RNN_Baseline"] = _run_experiment(
        rnn_model, train_loader, val_loader, device, num_epochs, learning_rate
    )

    print("\n\n--- Experiment Results ---")
    for config, metrics in experiment_results.items():
        print(f"{config}: Train Loss = {metrics['Final_Train_Loss']:.4f}, "
              f"Val Acc = {metrics['Final_Val_Accuracy']:.4f}, "
              f"Time = {metrics['Training_Time_sec']:.2f} sec, "
              f"Model Size = {metrics['Model_Size']} parameters")

# Entry point: run the experiment only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Using device: cuda

=== Sequence Length: 50 ===

--- Transformer_layers1_heads2 ---


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch 1/10: Train Loss=2.0809, Val Acc=0.4488
Epoch 2/10: Train Loss=1.9106, Val Acc=0.4639
Epoch 3/10: Train Loss=1.8796, Val Acc=0.4657
Epoch 4/10: Train Loss=1.8644, Val Acc=0.4737
Epoch 5/10: Train Loss=1.8555, Val Acc=0.4751
Epoch 6/10: Train Loss=1.8518, Val Acc=0.4777
Epoch 7/10: Train Loss=1.8496, Val Acc=0.4730
Epoch 8/10: Train Loss=1.8426, Val Acc=0.4754
Epoch 9/10: Train Loss=1.8386, Val Acc=0.4798
Epoch 10/10: Train Loss=1.8358, Val Acc=0.4790

--- Transformer_layers1_heads4 ---
Epoch 1/10: Train Loss=2.0398, Val Acc=0.4643
Epoch 2/10: Train Loss=1.8471, Val Acc=0.4822
Epoch 3/10: Train Loss=1.8078, Val Acc=0.4866
Epoch 4/10: Train Loss=1.7887, Val Acc=0.4908
Epoch 5/10: Train Loss=1.7759, Val Acc=0.4953
Epoch 6/10: Train Loss=1.7675, Val Acc=0.4958
Epoch 7/10: Train Loss=1.7609, Val Acc=0.4973
Epoch 8/10: Train Loss=1.7575, Val Acc=0.4984
Epoch 9/10: Train Loss=1.7539, Val Acc=0.4983
Epoch 10/10: Train Loss=1.7549, Val Acc=0.4980

--- Transformer_layers2_heads2 ---
Epoch 