In [None]:
import torch

# Module & Parameters

`Module` and `Parameter` are the building blocks used to define a neural network model in PyTorch. They are provided by the library as `torch.nn.Module` and `torch.nn.Parameter`.

Define a tiny model with 2 linear layers,
- 1st Linear Layer : Input size = 100, Output size = 200, activation function = ReLU
- 2nd Linear Layer : Input size = 200, Output size = 10, activation function = softmax

In [None]:
# Define the tiny model
class TinyModel(torch.nn.Module):
    """Tiny two-layer fully connected network.

    Pipeline: Linear(100 -> 200) -> ReLU -> Linear(200 -> 10) -> Softmax.
    The softmax is taken over the last dimension, so every output vector
    along that axis sums to 1.
    """

    def __init__(self):
        super().__init__()

        self.linear1 = torch.nn.Linear(in_features=100, out_features=200)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(in_features=200, out_features=10)
        # Name the axis explicitly: leaving `dim` unset relies on PyTorch's
        # deprecated implicit choice, which emits a warning on every call
        # and may normalise over a non-feature axis for some input ranks.
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Tensor whose last dimension has size 100.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 10, normalised by softmax over that axis.
        """
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x

In [None]:
# Build the model, then show its overall structure, the second linear
# layer on its own, and finally that layer's learnable tensors.
tinymodel = TinyModel()
output_layer = tinymodel.linear2

print(tinymodel, output_layer, sep='\n')
print('\n\nLayer params:')
print(*output_layer.parameters(), sep='\n')

TinyModel(
  (linear1): Linear(in_features=100, out_features=200, bias=True)
  (activation): ReLU()
  (linear2): Linear(in_features=200, out_features=10, bias=True)
  (softmax): Softmax(dim=None)
)
Linear(in_features=200, out_features=10, bias=True)


Layer params:
Parameter containing:
tensor([[ 0.0671, -0.0373,  0.0059,  ...,  0.0362,  0.0368, -0.0116],
        [-0.0673, -0.0417,  0.0070,  ..., -0.0311,  0.0361,  0.0162],
        [ 0.0387,  0.0154, -0.0532,  ..., -0.0027, -0.0343,  0.0588],
        ...,
        [ 0.0032, -0.0188,  0.0638,  ..., -0.0602,  0.0433,  0.0691],
        [-0.0705,  0.0332, -0.0491,  ...,  0.0665, -0.0046,  0.0087],
        [-0.0506, -0.0153,  0.0589,  ..., -0.0664, -0.0575, -0.0069]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0376,  0.0694, -0.0425,  0.0447,  0.0314, -0.0348,  0.0143,  0.0132,
        -0.0621,  0.0297], requires_grad=True)


# Common Layer Types

- Linear Layers
- Convolution Layers
- Recurrent Layers
- Data Manipulation Layers: Max Pooling, Normalization, Dropout layers

# Activation functions

The main purpose of an activation function is to introduce nonlinearity into the neural network.

# Loss Functions
The loss function measures the disparity between the target's true value and the value the model predicts.

The **optimizer** tells the neural network how to solve the problem. It is the algorithm that adjusts the weights to minimize the loss. Virtually all optimization algorithms are based on **Stochastic Gradient Descent** (SGD).
- **Adam** is an SGD-based algorithm with an adaptive learning rate, which makes it a general-purpose optimizer suitable for most problems without much tuning of the learning rate.