Every module/layer in PyTorch subclasses `nn.Module`. Here we build a model to classify the FashionMNIST dataset.

In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader 
from torchvision import datasets, transforms

In [3]:
# prefer the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [7]:
class NeuralNet(nn.Module):
    """Fully connected classifier that maps each sample to 10 logits.

    Each sample is flattened to 28*28 = 784 features and passed through
    three linear layers, with a ReLU after each of the first two.
    """

    def __init__(self):
        """Create the flatten layer and the linear/ReLU stack."""
        super().__init__()
        self.flatten = nn.Flatten()
        # width shared by both hidden layers
        hidden_width = 512
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 10),
        )

    def forward(self, x):
        """Return the raw class scores (logits) for the batch `x`.

        The input is flattened first, so no softmax or other
        normalization is applied here.
        """
        return self.linear_relu_stack(self.flatten(x))

In [8]:
# create an instance of NeuralNet, move it to the selected device,
# and print its structure
model = NeuralNet()
model = model.to(device)
print(model)

NeuralNet(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [23]:
# X is a batch holding one random image (a 28x28 pixel matrix),
# created directly on the same device as the model
X = torch.rand((1, 28, 28), device=device)
logits = model(X)
print(f"logits:\n {logits}")
# softmax along dim=1 turns the logits into per-class probabilities
pred_prob = nn.Softmax(dim=1)(logits)
print(f"pred_prob:\n {pred_prob}")
# the predicted class is the index of the largest probability
y_pred = pred_prob.argmax(dim=1)
print(f"predicted class:\n {y_pred}")

logits:
 tensor([[-0.0529, -0.0290, -0.0683, -0.0517, -0.0141, -0.0064, -0.0648,  0.0112,
         -0.0105, -0.0253]], grad_fn=<AddmmBackward0>)
pred_prob:
 tensor([[0.0978, 0.1002, 0.0963, 0.0979, 0.1017, 0.1025, 0.0967, 0.1043, 0.1021,
         0.1006]], grad_fn=<SoftmaxBackward0>)
predicted class:
 tensor([7])


In [29]:
# a sample minibatch: 3 random images, each 28x28
input_image = torch.rand((3, 28, 28))
print(input_image.shape)
input_image

torch.Size([3, 28, 28])


tensor([[[0.1698, 0.6401, 0.9141,  ..., 0.5754, 0.0915, 0.3493],
         [0.2870, 0.9838, 0.6591,  ..., 0.7330, 0.4444, 0.0334],
         [0.3381, 0.1555, 0.8735,  ..., 0.9459, 0.5256, 0.5829],
         ...,
         [0.1317, 0.4292, 0.0351,  ..., 0.3460, 0.9050, 0.5769],
         [0.9101, 0.5752, 0.7337,  ..., 0.1006, 0.1500, 0.6448],
         [0.5326, 0.7913, 0.3438,  ..., 0.8404, 0.7613, 0.5359]],

        [[0.7991, 0.4153, 0.7676,  ..., 0.5111, 0.0519, 0.6828],
         [0.1036, 0.1688, 0.8555,  ..., 0.5817, 0.2393, 0.4748],
         [0.2353, 0.4067, 0.3573,  ..., 0.3784, 0.8680, 0.7492],
         ...,
         [0.1080, 0.8656, 0.9572,  ..., 0.3794, 0.8536, 0.7911],
         [0.2099, 0.7409, 0.4736,  ..., 0.2020, 0.6695, 0.4065],
         [0.6189, 0.4603, 0.9020,  ..., 0.8841, 0.6310, 0.5967]],

        [[0.6085, 0.8624, 0.3412,  ..., 0.8852, 0.8280, 0.1239],
         [0.6048, 0.4830, 0.4760,  ..., 0.0383, 0.9964, 0.0899],
         [0.1420, 0.7001, 0.5563,  ..., 0.2758, 0.4613, 0.

In [32]:
# nn.Flatten keeps dim 0 (the minibatch) and merges the remaining dims,
# so each 28x28 image becomes a vector of 784 values
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.shape)
flat_image

torch.Size([3, 784])


tensor([[0.1698, 0.6401, 0.9141,  ..., 0.8404, 0.7613, 0.5359],
        [0.7991, 0.4153, 0.7676,  ..., 0.8841, 0.6310, 0.5967],
        [0.6085, 0.8624, 0.3412,  ..., 0.0328, 0.1407, 0.5785]])

In [36]:
# nn.Linear applies a linear transformation to its input using the
# weights and biases it stores; here 784 input features -> 20 outputs
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)
hidden1

torch.Size([3, 20])


tensor([[-0.0618,  0.4944,  0.0778,  0.5778,  0.5266, -0.0043, -0.1944, -0.0483,
         -0.3917, -0.1207, -0.0785,  0.4999, -0.0391,  0.2558,  0.2724,  0.2816,
          0.0992, -0.3392,  0.3312, -0.1209],
        [ 0.1175,  0.8568,  0.2291,  0.5376,  0.6717, -0.2213, -0.1995, -0.1520,
         -0.4277, -0.0190, -0.1352,  0.6526, -0.1882,  0.2640,  0.2507,  0.0662,
         -0.0216, -0.2648,  0.3321,  0.2048],
        [-0.1206,  0.5289,  0.2122,  0.5381,  0.6254, -0.0936, -0.3246, -0.4594,
         -0.4243,  0.0403, -0.2283,  0.9413, -0.0146,  0.1529,  0.2474,  0.4443,
          0.0868, -0.1364,  0.0500, -0.1638]], grad_fn=<AddmmBackward0>)

Non-linear activations are what create the complex mappings between the model’s inputs and outputs. They are applied after linear transformations to introduce nonlinearity, helping neural networks learn a wide variety of phenomena.

In [37]:
# show the hidden activations before and after applying ReLU
print(f"before ReLU:\n {hidden1}")
# hidden1 is rebound to the ReLU output, so the pre-activation values
# are no longer available under this name after this line
hidden1 = nn.ReLU()(hidden1)
print(f"after ReLU:\n {hidden1}")

before ReLU:
 tensor([[-0.0618,  0.4944,  0.0778,  0.5778,  0.5266, -0.0043, -0.1944, -0.0483,
         -0.3917, -0.1207, -0.0785,  0.4999, -0.0391,  0.2558,  0.2724,  0.2816,
          0.0992, -0.3392,  0.3312, -0.1209],
        [ 0.1175,  0.8568,  0.2291,  0.5376,  0.6717, -0.2213, -0.1995, -0.1520,
         -0.4277, -0.0190, -0.1352,  0.6526, -0.1882,  0.2640,  0.2507,  0.0662,
         -0.0216, -0.2648,  0.3321,  0.2048],
        [-0.1206,  0.5289,  0.2122,  0.5381,  0.6254, -0.0936, -0.3246, -0.4594,
         -0.4243,  0.0403, -0.2283,  0.9413, -0.0146,  0.1529,  0.2474,  0.4443,
          0.0868, -0.1364,  0.0500, -0.1638]], grad_fn=<AddmmBackward0>)
after ReLU:
 tensor([[0.0000, 0.4944, 0.0778, 0.5778, 0.5266, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.4999, 0.0000, 0.2558, 0.2724, 0.2816, 0.0992, 0.0000,
         0.3312, 0.0000],
        [0.1175, 0.8568, 0.2291, 0.5376, 0.6717, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.6526, 0.0000, 0.2640, 0.25

`nn.Sequential` is an ordered container of modules. The data is passed through all the modules in the same order as defined. You can use sequential containers to put together a quick network like `seq_modules`.

In [38]:
# chain the previously created flatten and layer1 modules with a ReLU and
# a final 20 -> 10 linear layer; data flows through them in this order
seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(20, 10))
# a fresh random minibatch of 3 images of size 28x28
input_image = torch.rand((3, 28, 28))
logits = seq_modules(input_image)
logits

tensor([[-0.1099, -0.0872, -0.2717,  0.0881,  0.0162, -0.5116,  0.1605, -0.2554,
          0.0309,  0.5082],
        [-0.1598,  0.0551, -0.1110,  0.1366,  0.0703, -0.6088,  0.2912, -0.1996,
          0.1731,  0.4252],
        [-0.1646, -0.0639, -0.1984,  0.1282,  0.2406, -0.6302,  0.3010, -0.2357,
          0.0694,  0.4131]], grad_fn=<AddmmBackward0>)

The last linear layer of the neural network returns logits - raw values in $(-\infty, \infty)$ - which are passed to the `nn.Softmax` module. The logits are scaled to values in [0, 1] representing the model’s predicted probabilities for each class. The `dim` parameter indicates the dimension along which the values must sum to 1.

In [42]:
# dim=1 normalizes across the class scores within each row of logits,
# so every row of pred_prob sums to 1
softmax = nn.Softmax(dim=1)
pred_prob = softmax(logits)
pred_prob

tensor([[0.0903, 0.0923, 0.0768, 0.1100, 0.1024, 0.0604, 0.1183, 0.0780, 0.1039,
         0.1675],
        [0.0816, 0.1012, 0.0857, 0.1098, 0.1027, 0.0521, 0.1281, 0.0784, 0.1139,
         0.1465],
        [0.0826, 0.0913, 0.0798, 0.1107, 0.1238, 0.0518, 0.1315, 0.0769, 0.1043,
         0.1471]], grad_fn=<SoftmaxBackward0>)

In [43]:
# accessing model parameters: print the model, then for every named
# parameter show its name, its size and its first two entries
print(f"Model structure:\n {model}\n\n")

for name, param in model.named_parameters():
    size = param.size()
    head = param[:2]
    print(f"Layer: {name} | Size: {size} | Values:\n {head}\n")

Model structure:
 NeuralNet(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values:
 tensor([[ 0.0055,  0.0169,  0.0306,  ..., -0.0017, -0.0210,  0.0297],
        [ 0.0155,  0.0314, -0.0341,  ..., -0.0245,  0.0039, -0.0274]],
       grad_fn=<SliceBackward0>)

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values:
 tensor([0.0087, 0.0126], grad_fn=<SliceBackward0>)

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values:
 tensor([[-0.0191,  0.0140, -0.0292,  ..., -0.0093,  0.0428,  0.0365],
        [-0.0132, -0.0311, -0.0419,  ...,  0.0119,  0.0276, -0.0148]],
       grad_fn=<SliceBackward0>)

Layer: linear_relu_stack.2.bias | Size: to