In [1]:
import torch
import numpy as np


# Dropout from scratch

In [2]:
'''
Toy network used to study dropout behaviour.

Architecture:
- 4 input features
- hidden layer 1: 6 units + bias, ReLU activation
- hidden layer 2: 5 units + bias, ReLU activation
- output layer: 1 unit + bias

ref: # https://nilanjanchattopadhyay.github.io/basics/2020/04/20/Regularization-from-Scratch-Dropout.html
'''
# Seed the global RNG so every tensor below is reproducible. The order of the
# torch.rand calls matters: changing it changes the sampled values.
torch.manual_seed(0)

############# TRAINING #############

# Mini-batch of 10 examples, each with 4 input features.
X = torch.rand((10, 4))

# Weights and biases, sampled uniformly from [0, 1).
# hidden layer 1: 4 inputs -> 6 units
W1, B1 = torch.rand((4, 6)), torch.rand((6,))

# hidden layer 2: 6 units -> 5 units
W2, B2 = torch.rand((6, 5)), torch.rand((5,))

# output layer: 5 units -> 1 output
W3, B3 = torch.rand((5, 1)), torch.rand((1,))


In [3]:

# Dropout is applied by multiplying a layer's activations with a binary mask:
# a 1 keeps the corresponding unit and a 0 drops it.
#
# Convention used here: `p` is the probability of DROPPING a unit (the same
# meaning `p` has in torch.nn.Dropout), so each unit is kept with probability
# 1 - p. Hidden layer 1 uses p = 0.6 (keep probability 0.4) and hidden layer 2
# uses p = 0.5 (keep probability 0.5).
#
# Each mask has shape (1, n_units) and is broadcast over the batch dimension,
# so the same units are dropped for every example in the mini-batch.

########################### TRAINING ###########################

# Hidden layer 1: affine transform -> ReLU -> dropout
H1 = X @ W1 + B1
H1.clamp_(0)  # ReLU activation, in place
p = 0.6  # drop probability for hidden layer 1
keep_prob = 1 - p  # probability that a unit survives (0.4)
mask = torch.zeros(1, 6).bernoulli_(keep_prob)  # each entry is 1 with probability keep_prob
H1 *= mask

# Hidden layer 2: affine transform -> ReLU -> dropout
H2 = H1 @ W2 + B2
H2.clamp_(0)  # ReLU activation, in place
p = 0.5  # drop probability for hidden layer 2
keep_prob = 1 - p  # probability that a unit survives (0.5)
mask = torch.zeros(1, 5).bernoulli_(keep_prob)  # each entry is 1 with probability keep_prob
H2 *= mask

# Output layer (no activation, no dropout)
out = H2 @ W3 + B3
print(H1, '\n', H2)

tensor([[0.0000, 0.0000, 1.1927, 1.1133, 1.5882, 0.0000],
        [0.0000, 0.0000, 1.9356, 1.6609, 1.8461, 0.0000],
        [0.0000, 0.0000, 1.5445, 1.2891, 1.6806, 0.0000],
        [0.0000, 0.0000, 1.2012, 1.1219, 1.2317, 0.0000],
        [0.0000, 0.0000, 1.4890, 1.2556, 1.7159, 0.0000],
        [0.0000, 0.0000, 2.1387, 1.7740, 2.0687, 0.0000],
        [0.0000, 0.0000, 1.7402, 1.1175, 1.8941, 0.0000],
        [0.0000, 0.0000, 1.6630, 1.5181, 1.5129, 0.0000],
        [0.0000, 0.0000, 0.8459, 0.8050, 1.1516, 0.0000],
        [0.0000, 0.0000, 2.0426, 1.7400, 2.1815, 0.0000]]) 
 tensor([[2.5590, 3.6239, 2.9962, 0.0000, 0.0000],
        [2.9882, 4.7482, 4.0198, 0.0000, 0.0000],
        [2.7072, 4.0990, 3.4328, 0.0000, 0.0000],
        [2.3193, 3.3671, 2.7001, 0.0000, 0.0000],
        [2.7155, 4.0592, 3.4051, 0.0000, 0.0000],
        [3.1943, 5.1549, 4.4203, 0.0000, 0.0000],
        [2.7884, 4.3728, 3.7417, 0.0000, 0.0000],
        [2.6919, 4.1803, 3.4553, 0.0000, 0.0000],
        [2.1207, 

In [4]:
########################### INFERENCE ###########################
# No mask is sampled at inference time. Each hidden layer's activations are
# instead multiplied by (1 - p), using the same p values as the training cell.

# Hidden layer 1: affine transform -> ReLU -> scale
p = 0.6
H1 = torch.clamp(X @ W1 + B1, min=0).mul(1 - p)

# Hidden layer 2: affine transform -> ReLU -> scale
p = 0.5
H2 = torch.clamp(H1 @ W2 + B2, min=0).mul(1 - p)

# Output layer (no activation, no scaling)
out = H2 @ W3 + B3
print(H1, '\n', H2)

tensor([[0.6977, 0.4997, 0.4771, 0.4453, 0.6353, 0.7849],
        [0.7733, 0.6038, 0.7742, 0.6644, 0.7384, 0.9302],
        [0.7341, 0.5378, 0.6178, 0.5156, 0.6722, 0.8087],
        [0.4835, 0.3474, 0.4805, 0.4488, 0.4927, 0.6098],
        [0.7799, 0.5604, 0.5956, 0.5022, 0.6863, 0.8594],
        [0.9164, 0.7062, 0.8555, 0.7096, 0.8275, 1.0686],
        [0.8468, 0.6108, 0.6961, 0.4470, 0.7577, 0.7054],
        [0.6130, 0.4707, 0.6652, 0.6072, 0.6052, 0.8153],
        [0.4789, 0.3148, 0.3384, 0.3220, 0.4607, 0.5317],
        [0.9045, 0.7260, 0.8170, 0.6960, 0.8726, 1.0088]]) 
 tensor([[1.1470, 1.5303, 1.1804, 1.0582, 0.6567],
        [1.2989, 1.8529, 1.4588, 1.2406, 0.8280],
        [1.1988, 1.6515, 1.2888, 1.1237, 0.7288],
        [0.9997, 1.3338, 0.9887, 0.8613, 0.5247],
        [1.2181, 1.6766, 1.3153, 1.1538, 0.7498],
        [1.4087, 2.0394, 1.6339, 1.3879, 0.9481],
        [1.2468, 1.7073, 1.3610, 1.1951, 0.7939],
        [1.1577, 1.6309, 1.2510, 1.0591, 0.6899],
        [0.9376, 

# Inverted Dropout


In [5]:
########################### TRAINING ###########################
# Inverted dropout: the surviving activations are rescaled during training by
# dividing by the keep probability, so their expected value is unchanged.
#
# Convention used here: `p` is the probability of DROPPING a unit (the same
# meaning `p` has in torch.nn.Dropout); a unit is kept with probability 1 - p.
# Each mask has shape (1, n_units) and is broadcast over the batch dimension.

# Hidden layer 1: affine transform -> ReLU -> dropout -> rescale
H1 = X @ W1 + B1
H1.clamp_(0)  # ReLU activation, in place
p = 0.6  # drop probability for hidden layer 1
keep_prob = 1 - p  # probability that a unit survives (0.4)
mask = torch.zeros(1, 6).bernoulli_(keep_prob)  # each entry is 1 with probability keep_prob
H1 *= mask
# Rescale the surviving activations of hidden layer 1
H1 /= keep_prob

# Hidden layer 2: affine transform -> ReLU -> dropout -> rescale
H2 = H1 @ W2 + B2
H2.clamp_(0)  # ReLU activation, in place
p = 0.5  # drop probability for hidden layer 2
keep_prob = 1 - p  # probability that a unit survives (0.5)
mask = torch.zeros(1, 5).bernoulli_(keep_prob)  # each entry is 1 with probability keep_prob
H2 *= mask
# Rescale the surviving activations of hidden layer 2
H2 /= keep_prob

# Output layer (no activation, no dropout)
out = H2 @ W3 + B3
print(H1, '\n', H2)

tensor([[4.3605, 0.0000, 0.0000, 0.0000, 3.9705, 0.0000],
        [4.8330, 0.0000, 0.0000, 0.0000, 4.6152, 0.0000],
        [4.5881, 0.0000, 0.0000, 0.0000, 4.2014, 0.0000],
        [3.0219, 0.0000, 0.0000, 0.0000, 3.0793, 0.0000],
        [4.8746, 0.0000, 0.0000, 0.0000, 4.2896, 0.0000],
        [5.7276, 0.0000, 0.0000, 0.0000, 5.1718, 0.0000],
        [5.2927, 0.0000, 0.0000, 0.0000, 4.7353, 0.0000],
        [3.8315, 0.0000, 0.0000, 0.0000, 3.7824, 0.0000],
        [2.9930, 0.0000, 0.0000, 0.0000, 2.8791, 0.0000],
        [5.6530, 0.0000, 0.0000, 0.0000, 5.4538, 0.0000]]) 
 tensor([[ 0.0000,  0.0000,  0.0000, 14.1767,  8.2431],
        [ 0.0000,  0.0000,  0.0000, 15.9266,  9.1336],
        [ 0.0000,  0.0000,  0.0000, 14.9103,  8.6712],
        [ 0.0000,  0.0000,  0.0000, 10.5048,  5.7306],
        [ 0.0000,  0.0000,  0.0000, 15.5553,  9.2079],
        [ 0.0000,  0.0000,  0.0000, 18.3270, 10.8122],
        [ 0.0000,  0.0000,  0.0000, 16.9322,  9.9944],
        [ 0.0000,  0.0000,  0.00