# Building Models with PyTorch

torch.nn.Module and torch.nn.Parameter

In [1]:
import torch

class TinyModel(torch.nn.Module):
    
    def __init__(self):
        super(TinyModel, self).__init__()
        
        self.linear1 = torch.nn.Linear(100, 200)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(200, 10)
        self.softmax = torch.nn.Softmax()
        
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x
    
tinymodel = TinyModel()

print('The Model: ')
print(tinymodel)

print('\n\nJust one layer: ')
print(tinymodel.linear2)

print('\n\nModel parameters: ')
for param in  tinymodel.parameters():
    print(param)
    
print('\n\nLayer params: ')
for param in tinymodel.linear2.parameters():
    print(param)

The Model: 
TinyModel(
  (linear1): Linear(in_features=100, out_features=200, bias=True)
  (activation): ReLU()
  (linear2): Linear(in_features=200, out_features=10, bias=True)
  (softmax): Softmax(dim=None)
)


Just one layer: 
Linear(in_features=200, out_features=10, bias=True)


Model parameters: 
Parameter containing:
tensor([[ 0.0228, -0.0951, -0.0068,  ...,  0.0135,  0.0041, -0.0894],
        [-0.0156, -0.0455, -0.0296,  ...,  0.0958,  0.0287, -0.0145],
        [-0.0017,  0.0066,  0.0915,  ...,  0.0712, -0.0852,  0.0636],
        ...,
        [ 0.0299,  0.0421,  0.0352,  ..., -0.0230,  0.0389, -0.0771],
        [-0.0827,  0.0512, -0.0924,  ...,  0.0151, -0.0472, -0.0377],
        [ 0.0446,  0.0263, -0.0705,  ...,  0.0504, -0.0786, -0.0434]],
       requires_grad=True)
Parameter containing:
tensor([ 7.5023e-02, -8.9974e-02,  7.6222e-02, -3.6827e-02, -8.4915e-02,
        -5.1428e-03, -5.5815e-03,  3.3337e-03,  1.4258e-02,  5.8927e-02,
        -8.5550e-02,  5.2066e-02, -7.2367e-02, 

Common Layer Types

Most basic type of neural network is linear or fully connected layer.

In [2]:
lin = torch.nn.Linear(3, 2)
x = torch.rand(1, 3)
print('Input: ')
print(x)

print('\n\nWeight and Bias parameters:')
for param in lin.parameters():
    print(param)
    
y = lin(x)
print('\n\nOutput: ')
print(y)

Input: 
tensor([[0.0746, 0.3789, 0.3715]])


Weight and Bias parameters:
Parameter containing:
tensor([[-0.0777, -0.4211, -0.5420],
        [ 0.0687,  0.4271, -0.4527]], requires_grad=True)
Parameter containing:
tensor([ 0.1407, -0.3300], requires_grad=True)


Output: 
tensor([[-0.2260, -0.3313]], grad_fn=<AddmmBackward0>)


If we multiply 'x' by the linear layer's weights, and add the biases, we will get the output 'y'

Convolutional Layers(CNN)

In [3]:
import torch.functional as F

class LeNet(torch.nn.Module):
    
    def __init__(self):
        super(LeNet, self).__init__()
        # 1 input image channel (black n white), 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = torch.nn.Conv2d(1, 6, 5)  # (no of input channels, no of output features, window/kernel size)
        # output tensor to con1 gives us 6x28x28. 6 features with 28x28 height n width of map
        self.conv2 = torch.nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = torch.nn.Linear(16 * 6 * 6, 120) # 6 * 6 from image resolution
        self.fc2 = torch.nn.Linear(120, 84)
        self.fc3 = torch.nn.Linear(84, 10)
        
    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is square, we can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:] # all dimensions except batch domension
        num_features = 1
        for s in size:
            num_features *= s
            
        return num_features
        

Recurrent Layers (RNN)

RNN are used for sequential data, from time series to the DNA nucleotides. An RNN does this by maintaining a hidden state that acts as a srt of memory for what is has been seen in hte sequence so far

In [4]:
class LSTMTagger(torch.nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM takes work embeddings as inputs and outputs hidden states with dimensionality hidden_dim
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
        
        # the linear layer that maps from hidden state space to tag space
        self.hidden2tag = torch.nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
    
    

Constructor has four arguments:
    1. vocab_size : no of words in input vocabulary. Each word is one-hot vector
    2. tagset_size : no of tags in the output set
    3. embedding_dim : size of embedding space for the vocabulary
    4. hidden_dim : the size of the LSTM's memory

Data Manipulation Layers

Max pooling reduce a tensor by combining cells, an dassinging the maximumvalue of the input cells to the output cell. This works as a layer that perform important function, but don't participate in larning process

In [5]:
my_tensor = torch.rand(1, 6, 6)
print(my_tensor)

maxpool_layer = torch.nn.MaxPool2d(3)
print(maxpool_layer(my_tensor))

tensor([[[0.4453, 0.1965, 0.2766, 0.4354, 0.4527, 0.4815],
         [0.0483, 0.6444, 0.4422, 0.0401, 0.0244, 0.8276],
         [0.0756, 0.5548, 0.3653, 0.3061, 0.4284, 0.8537],
         [0.2576, 0.9806, 0.3392, 0.6383, 0.0998, 0.2208],
         [0.6323, 0.5258, 0.1660, 0.8183, 0.0055, 0.2510],
         [0.7725, 0.0396, 0.5836, 0.8612, 0.5755, 0.1271]]])
tensor([[[0.6444, 0.8537],
         [0.9806, 0.8612]]])


Normalization layer re-center and normalize the output of one layer before feeding it to another. Centering and scaling the intermediate tensors has a number of beneficial effects, such as letting use higher learning rates without exploding/vanishing gradients

In [6]:
my_tensor = torch.rand(1, 4, 4) * 20 + 5
print(my_tensor)

print(my_tensor.mean())

norm_layer = torch.nn.BatchNorm1d(4)
normed_tensor = norm_layer(my_tensor)
print(normed_tensor)

print(normed_tensor.mean())

tensor([[[ 8.9663, 18.1267, 23.2359, 19.7406],
         [13.7878, 13.1657, 16.5746,  9.4849],
         [18.8747, 14.9917, 10.2237,  9.2706],
         [18.0279, 10.0434, 19.5325, 13.1947]]])
tensor(14.8276)
tensor([[[-1.6223,  0.1156,  1.0849,  0.4218],
         [ 0.2116, -0.0346,  1.3148, -1.4918],
         [ 1.4334,  0.4277, -0.8072, -1.0540],
         [ 0.7467, -1.3614,  1.1440, -0.5294]]],
       grad_fn=<NativeBatchNormBackward0>)
tensor(-5.9605e-08, grad_fn=<MeanBackward0>)


Dropout layers are tools for encouraging sparse representations in the model

In [11]:
my_tensor = torch.rand(1, 4, 4)

dropout = torch.nn.Dropout(p=0.2)
print(dropout(my_tensor))
print(dropout(my_tensor))

tensor([[[0.6438, 0.7842, 0.3329, 0.7233],
         [0.0000, 0.7365, 0.1289, 0.9480],
         [1.2457, 0.7169, 0.1661, 1.2080],
         [1.0845, 0.0000, 1.0175, 0.4237]]])
tensor([[[0.6438, 0.7842, 0.3329, 0.0000],
         [0.0000, 0.7365, 0.1289, 0.9480],
         [1.2457, 0.7169, 0.1661, 0.0000],
         [1.0845, 0.0000, 0.0000, 0.4237]]])
