In [1]:
import streamlit as st
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt

# -----------------------------
# Streamlit UI
# -----------------------------
st.title("Module 3: Training Deep Neural Networks (ANN)")

# Hyperparameters
# Each widget call returns the value currently selected in the UI; the
# positional slider arguments are (label, min, max, default).
batch_size = st.slider("Batch Size", 8, 128, 32)
# Streamlit formats float sliders with "%0.2f" by default, which would display
# small learning rates such as 0.0001 or 0.0011 as "0.00". Show four decimals
# so the selected value is actually readable.
learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.01, step=0.001, format="%.4f")
epochs = st.slider("Epochs", 1, 20, 5)
optimizer_choice = st.selectbox("Optimizer", ["SGD", "Adam"])
dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2, step=0.05)
# Passed to the optimizer as `weight_decay`.
l2_reg = st.slider("L2 Regularization (Weight Decay)", 0.0, 0.1, 0.0, step=0.01)
# Only consumed when the SGD optimizer is selected.
momentum = st.slider("Momentum (SGD only)", 0.0, 0.99, 0.9, step=0.01)

# -----------------------------
# Load MNIST Dataset
# -----------------------------
# Convert images to tensors, then normalize with mean 0.5 and std 0.5.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)

# 80/20 train/validation split of the MNIST training set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split: this script is re-executed on every UI interaction, and an
# unseeded split would score each hyperparameter setting on a different
# validation subset, making runs incomparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Only the training loader shuffles; validation order does not affect metrics.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# -----------------------------
# Define Model with Weight Initialization and Dropout
# -----------------------------
class SimpleANN(nn.Module):
    """Three-layer fully connected classifier (784 -> 128 -> 64 -> 10).

    Dropout is applied once, after the first hidden activation. The two
    hidden layers use Kaiming-normal weight initialization and the output
    layer uses Xavier-normal; biases keep the ``nn.Linear`` defaults.

    Args:
        dropout_rate: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, dropout_rate=0.2):
        super().__init__()
        # Sub-modules are registered in the same order as before
        # (fc1, dropout, fc2, fc3) so parameter order is unchanged.
        self.fc1 = nn.Linear(28 * 28, 128)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

        # Weight initialization: Kaiming for the ReLU-activated hidden
        # layers, Xavier for the output layer.
        for hidden_layer in (self.fc1, self.fc2):
            nn.init.kaiming_normal_(hidden_layer.weight)
        nn.init.xavier_normal_(self.fc3.weight)

    def forward(self, x):
        """Return the 10 raw class scores (no softmax) for each input row.

        The input is reshaped to ``(-1, 784)`` before the first layer.
        """
        flat = x.view(-1, self.fc1.in_features)
        hidden = self.dropout(torch.relu(self.fc1(flat)))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Build a fresh network using the dropout rate selected in the UI.
model = SimpleANN(dropout_rate=dropout_rate)

# -----------------------------
# Define Optimizer
# -----------------------------
# Both optimizers receive the same learning rate and weight decay (L2);
# the momentum setting is only passed to SGD.
if optimizer_choice == "Adam":
    optimizer = optim.Adam(
        model.parameters(),
        lr=learning_rate,
        weight_decay=l2_reg,
    )
elif optimizer_choice == "SGD":
    optimizer = optim.SGD(
        model.parameters(),
        lr=learning_rate,
        momentum=momentum,
        weight_decay=l2_reg,
    )

# Cross-entropy loss applied to the model's raw class scores.
criterion = nn.CrossEntropyLoss()

# -----------------------------
# Training Loop with Live Plot
# -----------------------------
# Per-epoch history used for the plots below.
train_losses, val_losses, val_accuracies = [], [], []

if st.button("Start Training"):
    st.write("Training started...")
    for epoch in range(1, epochs + 1):
        # ---- Training pass ----
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            # Gradient Clipping to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            # Weight the batch loss by the batch size so the epoch value is a
            # per-sample average even when the last batch is smaller
            # (assumes `criterion` returns a batch mean, the
            # nn.CrossEntropyLoss default).
            running_loss += loss.item() * images.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_loss)

        # ---- Validation pass (no gradients, eval mode) ----
        model.eval()
        val_loss = 0.0
        correct = 0
        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)
                preds = outputs.argmax(dim=1)
                correct += (preds == labels).sum().item()
        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)
        val_acc = correct / len(val_loader.dataset)
        val_accuracies.append(val_acc)

        # Live Plot
        st.write(f"Epoch {epoch}/{epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {val_loss:.4f} - Val Acc: {val_acc:.4f}")
        fig, ax = plt.subplots(1, 2, figsize=(10, 4))
        epoch_axis = range(1, len(train_losses) + 1)
        ax[0].plot(epoch_axis, train_losses, label="Train Loss")
        ax[0].plot(epoch_axis, val_losses, label="Val Loss")
        ax[0].set_xlabel("Epochs")
        ax[0].set_ylabel("Loss")
        ax[0].legend()
        ax[0].grid(True)
        ax[1].plot(epoch_axis, val_accuracies, label="Val Accuracy", color="green")
        ax[1].set_xlabel("Epochs")
        ax[1].set_ylabel("Accuracy")
        ax[1].legend()
        ax[1].grid(True)
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; without this a
        # new figure would leak on every epoch of every run.
        plt.close(fig)

    # Streamlit re-executes the whole script on every interaction, which
    # rebuilds `model` with fresh random weights. Keep the trained weights in
    # session state so the save button below can still reach them.
    st.session_state["trained_state_dict"] = model.state_dict()
    st.success("Training Completed!")

# -----------------------------
# Optional: Save Model Button
# -----------------------------
if st.button("Save Trained Model"):
    # Clicking this button triggers a rerun in which "Start Training" is False,
    # so `model` here is untrained; save the weights stored after training.
    if "trained_state_dict" not in st.session_state:
        st.warning("No trained model to save yet. Click 'Start Training' first.")
    else:
        torch.save(st.session_state["trained_state_dict"], "mnist_ann_trained.pth")
        st.write("Model saved as mnist_ann_trained.pth")

ModuleNotFoundError: No module named 'streamlit'

Here’s a **full Python + PyTorch Streamlit demo** for **Module 3: Training Deep Neural Networks** covering **batch/mini-batch training, learning rate tuning, weight initialization, regularization, gradient issues (gradient clipping), and live training visualization**.

It’s interactive: learners can **train a small ANN on MNIST**, see **live-updating loss/accuracy**, and experiment with **regularization and optimizers**.

```python
import streamlit as st
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt

# -----------------------------
# Streamlit UI
# -----------------------------
st.title("Module 3: Training Deep Neural Networks (ANN)")

# Hyperparameters
# Each widget call returns the value currently selected in the UI; the
# positional slider arguments are (label, min, max, default).
batch_size = st.slider("Batch Size", 8, 128, 32)
# Streamlit formats float sliders with "%0.2f" by default, which would display
# small learning rates such as 0.0001 or 0.0011 as "0.00". Show four decimals
# so the selected value is actually readable.
learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.01, step=0.001, format="%.4f")
epochs = st.slider("Epochs", 1, 20, 5)
optimizer_choice = st.selectbox("Optimizer", ["SGD", "Adam"])
dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2, step=0.05)
# Passed to the optimizer as `weight_decay`.
l2_reg = st.slider("L2 Regularization (Weight Decay)", 0.0, 0.1, 0.0, step=0.01)
# Only consumed when the SGD optimizer is selected.
momentum = st.slider("Momentum (SGD only)", 0.0, 0.99, 0.9, step=0.01)

# -----------------------------
# Load MNIST Dataset
# -----------------------------
# Convert images to tensors, then normalize with mean 0.5 and std 0.5.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)

# 80/20 train/validation split of the MNIST training set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split: this script is re-executed on every UI interaction, and an
# unseeded split would score each hyperparameter setting on a different
# validation subset, making runs incomparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Only the training loader shuffles; validation order does not affect metrics.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# -----------------------------
# Define Model with Weight Initialization and Dropout
# -----------------------------
class SimpleANN(nn.Module):
    """Three-layer fully connected classifier (784 -> 128 -> 64 -> 10).

    Dropout is applied once, after the first hidden activation. The two
    hidden layers use Kaiming-normal weight initialization and the output
    layer uses Xavier-normal; biases keep the ``nn.Linear`` defaults.

    Args:
        dropout_rate: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, dropout_rate=0.2):
        super().__init__()
        # Sub-modules are registered in the same order as before
        # (fc1, dropout, fc2, fc3) so parameter order is unchanged.
        self.fc1 = nn.Linear(28 * 28, 128)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

        # Weight initialization: Kaiming for the ReLU-activated hidden
        # layers, Xavier for the output layer.
        for hidden_layer in (self.fc1, self.fc2):
            nn.init.kaiming_normal_(hidden_layer.weight)
        nn.init.xavier_normal_(self.fc3.weight)

    def forward(self, x):
        """Return the 10 raw class scores (no softmax) for each input row.

        The input is reshaped to ``(-1, 784)`` before the first layer.
        """
        flat = x.view(-1, self.fc1.in_features)
        hidden = self.dropout(torch.relu(self.fc1(flat)))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Build a fresh network using the dropout rate selected in the UI.
model = SimpleANN(dropout_rate=dropout_rate)

# -----------------------------
# Define Optimizer
# -----------------------------
# Both optimizers receive the same learning rate and weight decay (L2);
# the momentum setting is only passed to SGD.
if optimizer_choice == "Adam":
    optimizer = optim.Adam(
        model.parameters(),
        lr=learning_rate,
        weight_decay=l2_reg,
    )
elif optimizer_choice == "SGD":
    optimizer = optim.SGD(
        model.parameters(),
        lr=learning_rate,
        momentum=momentum,
        weight_decay=l2_reg,
    )

# Cross-entropy loss applied to the model's raw class scores.
criterion = nn.CrossEntropyLoss()

# -----------------------------
# Training Loop with Live Plot
# -----------------------------
# Per-epoch history used for the plots below.
train_losses, val_losses, val_accuracies = [], [], []

if st.button("Start Training"):
    st.write("Training started...")
    for epoch in range(1, epochs + 1):
        # ---- Training pass ----
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            # Gradient Clipping to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            # Weight the batch loss by the batch size so the epoch value is a
            # per-sample average even when the last batch is smaller
            # (assumes `criterion` returns a batch mean, the
            # nn.CrossEntropyLoss default).
            running_loss += loss.item() * images.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_loss)

        # ---- Validation pass (no gradients, eval mode) ----
        model.eval()
        val_loss = 0.0
        correct = 0
        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)
                preds = outputs.argmax(dim=1)
                correct += (preds == labels).sum().item()
        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)
        val_acc = correct / len(val_loader.dataset)
        val_accuracies.append(val_acc)

        # Live Plot
        st.write(f"Epoch {epoch}/{epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {val_loss:.4f} - Val Acc: {val_acc:.4f}")
        fig, ax = plt.subplots(1, 2, figsize=(10, 4))
        epoch_axis = range(1, len(train_losses) + 1)
        ax[0].plot(epoch_axis, train_losses, label="Train Loss")
        ax[0].plot(epoch_axis, val_losses, label="Val Loss")
        ax[0].set_xlabel("Epochs")
        ax[0].set_ylabel("Loss")
        ax[0].legend()
        ax[0].grid(True)
        ax[1].plot(epoch_axis, val_accuracies, label="Val Accuracy", color="green")
        ax[1].set_xlabel("Epochs")
        ax[1].set_ylabel("Accuracy")
        ax[1].legend()
        ax[1].grid(True)
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; without this a
        # new figure would leak on every epoch of every run.
        plt.close(fig)

    # Streamlit re-executes the whole script on every interaction, which
    # rebuilds `model` with fresh random weights. Keep the trained weights in
    # session state so the save button below can still reach them.
    st.session_state["trained_state_dict"] = model.state_dict()
    st.success("Training Completed!")

# -----------------------------
# Optional: Save Model Button
# -----------------------------
if st.button("Save Trained Model"):
    # Clicking this button triggers a rerun in which "Start Training" is False,
    # so `model` here is untrained; save the weights stored after training.
    if "trained_state_dict" not in st.session_state:
        st.warning("No trained model to save yet. Click 'Start Training' first.")
    else:
        torch.save(st.session_state["trained_state_dict"], "mnist_ann_trained.pth")
        st.write("Model saved as mnist_ann_trained.pth")
```

---

### **Features Implemented**

1. Batch and mini-batch training.
2. Adjustable learning rate and optimizer (SGD/Adam).
3. Weight initialization (Kaiming/Xavier).
4. Regularization: L2, Dropout.
5. Gradient clipping to handle exploding gradients.
6. Live training visualization: loss curves & accuracy plots.
7. Save trained model.

---

If you want, I can **merge this with the previous MNIST drawing/prediction app**, so learners can **train the network live and test by drawing digits interactively** in one Streamlit interface.

Do you want me to do that next?
