# Build the Neural Network
This notebook follows the steps of the [Build the Neural Network](https://docs.pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html) tutorial from the official PyTorch documentation.

## Imports

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

## Get Device for Training

In [2]:
# Set FORCE_CPU to False to let PyTorch pick an available accelerator instead.
FORCE_CPU = True  # ROCm does not work with my GPU

# The accelerator API is only queried when the CPU override is switched off.
if not FORCE_CPU and torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator().type
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


## Define the Class

In [3]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: Flatten -> (Linear -> ReLU) x 2 -> Linear.

    With the default arguments the layer sizes are 784 -> 512 -> 512 -> 10,
    i.e. exactly the network this notebook was originally written with.

    Args:
        in_features: Number of values per sample once the input has been
            flattened (default ``28*28``).
        hidden_features: Width of both hidden linear layers (default 512).
        num_classes: Number of logits produced per sample (default 10).
    """

    def __init__(self, in_features=28*28, hidden_features=512, num_classes=10):
        super().__init__()
        # Default nn.Flatten (start_dim=1) keeps dim 0 and merges all remaining dims.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten ``x`` and return the raw logits of the linear/ReLU stack.

        No softmax is applied here; callers normalise the logits themselves.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [4]:
# Build the network, then move its parameters to the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [5]:
# One random 28x28 input with a leading dimension of size 1, created on `device`.
X = torch.rand(1, 28, 28, device=device)

# Calling the module runs its forward pass and returns raw logits.
logits = model(X)

# Normalise along dim 1, then take the index of the largest probability.
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(dim=1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([2])


## Model Layers

In [6]:
# Random tensor of shape (3, 28, 28) used as sample input for the layer demos below.
input_image = torch.rand(3, 28, 28)
print(input_image.shape)

torch.Size([3, 28, 28])


### nn.Flatten()

In [7]:
# nn.Flatten with default arguments: dim 0 is kept, the remaining dims are merged.
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.shape)

torch.Size([3, 784])


In [8]:
# Show the built-in docstring of nn.Flatten for reference.
print(nn.Flatten.__doc__)


    Flattens a contiguous range of dims into a tensor.

    For use with :class:`~nn.Sequential`, see :meth:`torch.flatten` for details.

    Shape:
        - Input: :math:`(*, S_{\text{start}},..., S_{i}, ..., S_{\text{end}}, *)`,'
          where :math:`S_{i}` is the size at dimension :math:`i` and :math:`*` means any
          number of dimensions including none.
        - Output: :math:`(*, \prod_{i=\text{start}}^{\text{end}} S_{i}, *)`.

    Args:
        start_dim: first dim to flatten (default = 1).
        end_dim: last dim to flatten (default = -1).

    Examples::
        >>> input = torch.randn(32, 1, 5, 5)
        >>> # With default parameters
        >>> m = nn.Flatten()
        >>> output = m(input)
        >>> output.size()
        torch.Size([32, 25])
        >>> # With non-default parameters
        >>> m = nn.Flatten(0, 2)
        >>> output = m(input)
        >>> output.size()
        torch.Size([160, 5])
    


In [9]:
# start_dim=0: every dim is merged, giving a single axis of 3*28*28 = 2352 values.
nn.Flatten(0)(input_image).shape

torch.Size([2352])

In [10]:
# start_dim=1 (the default): dim 0 is kept and the two 28-sized dims merge into 784.
nn.Flatten(1)(input_image).shape

torch.Size([3, 784])

In [11]:
# start_dim=2: only the last dim is in range, so the shape is left unchanged.
nn.Flatten(2)(input_image).shape

torch.Size([3, 28, 28])

### nn.Linear()

In [12]:
# Show the built-in docstring of nn.Linear for reference.
print(nn.Linear.__doc__)

Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_\text{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_\text{in} = \text{in\_features}`.
        - Output: :math:`(*, H_\text{out})` where all but the last dimension
          are the same shape as the input and :math:`H_\text{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in\_features})`. The values are
     

In [22]:
# Affine layer mapping each flattened 784-value sample to 20 features.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)

torch.Size([3, 20])


In [23]:
# Learnable weight matrix of layer1; its shape is (out_features, in_features).
layer1.weight

Parameter containing:
tensor([[ 0.0155,  0.0231,  0.0092,  ...,  0.0034,  0.0149, -0.0177],
        [ 0.0254,  0.0017, -0.0072,  ..., -0.0033,  0.0132, -0.0324],
        [ 0.0349, -0.0247,  0.0112,  ..., -0.0285,  0.0160, -0.0226],
        ...,
        [ 0.0002,  0.0092,  0.0020,  ..., -0.0093,  0.0025,  0.0267],
        [-0.0142, -0.0116,  0.0126,  ...,  0.0153, -0.0225, -0.0186],
        [ 0.0051,  0.0236, -0.0126,  ..., -0.0268, -0.0228, -0.0330]],
       requires_grad=True)

In [24]:
# A 10 -> 10 affine layer applied to a single 1-D vector (no leading batch dim).
layer = nn.Linear(10, 10)
x = torch.rand(10)
y = layer(x)
y

tensor([-0.2747,  0.1350, -0.8865, -0.1894, -0.3170,  0.0018, -0.1592,  0.7772,
         0.3894,  0.3154], grad_fn=<ViewBackward0>)

In [25]:
# Inspect the weight matrix of the 10 -> 10 layer defined above.
layer.weight

Parameter containing:
tensor([[-0.2960, -0.0574, -0.1726,  0.0907,  0.2353, -0.3092,  0.1125, -0.0738,
          0.0952, -0.1898],
        [ 0.2573,  0.0363,  0.1349, -0.2025,  0.2598, -0.0496,  0.2254,  0.0152,
          0.2921,  0.0238],
        [ 0.0209, -0.1672, -0.3038, -0.2100, -0.1875, -0.2619, -0.2993, -0.0018,
          0.1180,  0.0061],
        [ 0.2653, -0.2538, -0.1997,  0.0977,  0.0902, -0.0793, -0.1537, -0.0416,
         -0.2008,  0.0539],
        [-0.0811, -0.1934, -0.1184,  0.2595, -0.0915,  0.0333,  0.0893,  0.1553,
         -0.1087, -0.0891],
        [ 0.1283, -0.1995, -0.1355,  0.2478,  0.3001, -0.2241,  0.1366, -0.2366,
         -0.0186,  0.1866],
        [ 0.0400, -0.2314,  0.3074, -0.2242, -0.1401, -0.1783, -0.2405, -0.1507,
          0.1855,  0.0565],
        [ 0.0750,  0.1534, -0.0423,  0.0825,  0.1479,  0.0587, -0.2124,  0.3001,
          0.0501,  0.1326],
        [-0.1835,  0.0153, -0.1971,  0.0915,  0.1285,  0.1167, -0.1436,  0.1649,
          0.1573, -0.1137

### nn.ReLU()

In [26]:
# Show the built-in docstring of nn.ReLU for reference.
print(nn.ReLU.__doc__)

Applies the rectified linear unit function element-wise.

    :math:`\text{ReLU}(x) = (x)^+ = \max(0, x)`

    Args:
        inplace: can optionally do the operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    .. image:: ../scripts/activation_images/ReLU.png

    Examples::

        >>> m = nn.ReLU()
        >>> input = torch.randn(2)
        >>> output = m(input)


      An implementation of CReLU - https://arxiv.org/abs/1603.05201

        >>> m = nn.ReLU()
        >>> input = torch.randn(2).unsqueeze(0)
        >>> output = torch.cat((m(input), m(-input)))
    


In [36]:
# ReLU is applied element-wise, so the output has the same shape as hidden1.
nn.ReLU()(hidden1).shape

torch.Size([3, 20])

In [35]:
# Shape of the ReLU input, for comparison with the ReLU output shape.
hidden1.shape

torch.Size([3, 20])

### nn.Sequential()

In [38]:
# Show the built-in docstring of nn.Sequential for reference.
print(nn.Sequential.__doc__)

A sequential container.

    Modules will be added to it in the order they are passed in the
    constructor. Alternatively, an ``OrderedDict`` of modules can be
    passed in. The ``forward()`` method of ``Sequential`` accepts any
    input and forwards it to the first module it contains. It then
    "chains" outputs to inputs sequentially for each subsequent module,
    finally returning the output of the last module.

    The value a ``Sequential`` provides over manually calling a sequence
    of modules is that it allows treating the whole container as a
    single module, such that performing a transformation on the
    ``Sequential`` applies to each of the modules it stores (which are
    each a registered submodule of the ``Sequential``).

    What's the difference between a ``Sequential`` and a
    :class:`torch.nn.ModuleList`? A ``ModuleList`` is exactly what it
    sounds like--a list for storing ``Module`` s! On the other hand,
    the layers in a ``Sequential`` are connecte

In [40]:
# Chain the modules built above with a fresh ReLU and a final 20 -> 10 layer.
# Data flows through them in the order they are listed.
seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(20, 10))

# Draw a new random (3, 28, 28) input and run it through the whole chain.
input_image = torch.rand(3, 28, 28)
logits = seq_modules(input_image)
logits

tensor([[ 0.0067, -0.1063, -0.0076, -0.1179,  0.0656,  0.0976, -0.0157, -0.1423,
          0.0229, -0.0053],
        [ 0.0334, -0.0901,  0.0608, -0.2005,  0.0832,  0.0651, -0.0172, -0.1291,
         -0.0088, -0.0482],
        [-0.0295, -0.1323,  0.0401, -0.1519,  0.1368,  0.1490,  0.0165, -0.0654,
          0.0090, -0.0015]], grad_fn=<AddmmBackward0>)

In [41]:
# One row of 10 logits per input sample.
logits.shape

torch.Size([3, 10])

### nn.Softmax()

In [42]:
# Show the built-in docstring of nn.Softmax for reference.
print(nn.Softmax.__doc__)

Applies the Softmax function to an n-dimensional input Tensor.

    Rescales them so that the elements of the n-dimensional output Tensor
    lie in the range [0,1] and sum to 1.

    Softmax is defined as:

    .. math::
        \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}

    When the input Tensor is a sparse tensor then the unspecified
    values are treated as ``-inf``.

    Shape:
        - Input: :math:`(*)` where `*` means, any number of additional
          dimensions
        - Output: :math:`(*)`, same shape as the input

    Returns:
        a Tensor of the same dimension and shape as the input with
        values in the range [0, 1]

    Args:
        dim (int): A dimension along which Softmax will be computed (so every slice
            along dim will sum to 1).

    .. note::
        This module doesn't work directly with NLLLoss,
        which expects the Log to be computed between the Softmax and itself.
        Use `LogSoftmax` instead (it's faster and ha

In [47]:
# Pass dim explicitly: omitting it relies on a deprecated implicit choice and emits a warning.
# dim=1 normalises across the 10 logits of each row, so every row sums to 1.
nn.Softmax(dim=1)(logits)

  return self._call_impl(*args, **kwargs)


tensor([[0.1024, 0.0915, 0.1010, 0.0904, 0.1087, 0.1122, 0.1002, 0.0883, 0.1041,
         0.1012],
        [0.1056, 0.0934, 0.1086, 0.0836, 0.1110, 0.1090, 0.1004, 0.0898, 0.1013,
         0.0973],
        [0.0970, 0.0875, 0.1039, 0.0858, 0.1145, 0.1159, 0.1015, 0.0935, 0.1007,
         0.0997]], grad_fn=<SoftmaxBackward0>)

In [46]:
# dim=1 is given explicitly to avoid the implicit-dim deprecation warning.
# Softmax returns a tensor with the same shape as its input.
nn.Softmax(dim=1)(logits).shape

  return self._call_impl(*args, **kwargs)


torch.Size([3, 10])

## Model Parameters

Many layers inside a neural network are parameterized, i.e. they have associated weights and biases that are optimized during training. Subclassing `nn.Module` automatically tracks all fields defined inside your model object, and makes all parameters accessible using your model’s `parameters()` or `named_parameters()` methods.

In this example, we iterate over each parameter, and print its size and a preview of its values.

In [48]:
# Print the module tree first, followed by blank lines as a separator.
print(f"Model structure: {model}\n\n")

# named_parameters() is unpacked into (name, parameter) pairs;
# param[:2] previews only the first two entries along dim 0.
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[-0.0356, -0.0218,  0.0281,  ...,  0.0111, -0.0255,  0.0221],
        [-0.0320, -0.0282,  0.0293,  ..., -0.0338, -0.0199, -0.0233]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([ 0.0343, -0.0066], grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[ 0.0011, -0.0437, -0.0245,  ...,  0.0176, -0.0318,  0.0113],
        [ 0.0069,  0.0055,  0.0164,  ...,  0.0265,  0.0180,  0.0083]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.bias | 