# 1. Library

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

# 2. Dataset loading

In [None]:
# MNIST is bundled with torchvision; the subsets are wrapped in torch.utils.data.DataLoader below.
# 1. Preprocessing pipeline: convert each image to a tensor, then normalize it
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# 2. Build Dataset objects for both MNIST splits (downloaded into ./data when missing)
full_train_dataset = datasets.MNIST("./data", train=True, transform=transform, download=True)
test_dataset = datasets.MNIST("./data", train=False, transform=transform, download=True)

# 3. Divide the training split 8:1:1 into train/validation/test parts.
#    The third part takes whatever is left, so the sizes always sum to the full length.
train_size = int(0.8 * len(full_train_dataset))
val_size = int(0.1 * len(full_train_dataset))
test_from_train_size = len(full_train_dataset) - (train_size + val_size)
train_dataset, val_dataset, test_from_train = random_split(
    full_train_dataset,
    [train_size, val_size, test_from_train_size],
)

# 4. Batch every subset; only the training loader shuffles its samples.
#    Note that test_loader serves the held-out part of the training split,
#    not test_dataset.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)
test_loader = DataLoader(test_from_train, shuffle=False, batch_size=1000)


# 3. Define your network

---
---

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    """Fully connected classifier with two ReLU hidden layers and a linear output layer.

    With the default arguments the layer widths are 784 -> 128 -> 128 -> 10.

    Args:
        input_size: Number of features per sample once the input is flattened.
        hidden_size: Width of each of the two hidden layers.
        output_size: Number of scores produced per sample.
    """

    def __init__(self, input_size: int = 28 * 28, hidden_size: int = 128, output_size: int = 10):
        super().__init__()
        # Two hidden layers + output layer (128, 128, 10 by default)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Flatten ``x`` into rows of ``input_size`` features and return the raw output scores.

        No activation is applied to the last layer, so the result has shape
        ``(rows, output_size)`` and holds unnormalized scores.
        """
        # reshape behaves like view for contiguous inputs but also accepts
        # non-contiguous ones (e.g. transposed tensors) instead of raising.
        x = x.reshape(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# 4. Train your network

In [None]:
# Initialize all the hyperparameters used in this notebook
epochs = 10            # number of full passes over the training data
batch_size = 64
learning_rate = 1e-3
input_size = 28 * 28   # 784
output_size = 10

In [None]:
# Write the training process
# Initialize model, loss, and optimizer (run the hyperparameters cell first)
import torch
import torch.nn as nn
import torch.optim as optim

# Pick the device first and move the model onto it before anything else
# gets a handle on its parameters (GPU if available, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyNet().to(device)
criterion = nn.CrossEntropyLoss()
# Build the optimizer only after the model is on its final device, so the
# parameters it tracks are the ones actually being trained.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    samples_seen = 0
    for images, labels in train_loader:
        # 1. Move data to the same device as the model
        images, labels = images.to(device), labels.to(device)
        # 2. Clear previous gradients
        optimizer.zero_grad()
        # 3. Forward pass: compute predictions
        outputs = model(images)
        # 4. Calculate Loss
        loss = criterion(outputs, labels)
        # 5. Backward pass: compute gradients
        loss.backward()
        # 6. Update weights
        optimizer.step()
        # criterion returns the mean over the batch; weight it by the batch
        # size so a shorter final batch does not skew the epoch average.
        batch_count = labels.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count
    # Print the per-sample average loss every epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/samples_seen:.4f}")


print("Training Complete!")