# Introduction

There are a few types of pytorch dropout that need to be used in order to implement the paper [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](https://arxiv.org/abs/1512.05287).

This notebook explores them in order to fully understand them

# weight_drop


This dropout shuts down completely network weights

In [1]:
from weight_drop import WeightDrop

from torch import nn
import torch

In [2]:
in_features = 3
out_features = 2
layer = nn.Linear(in_features,out_features)
with torch.no_grad():
    layer.weight.copy_(torch.ones(out_features,in_features))
    layer.bias.copy_(torch.ones(out_features))

Parameter containing:
tensor([[1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)

Parameter containing:
tensor([1., 1.], requires_grad=True)

In [3]:
wdropout = WeightDrop(layer,['weight'],dropout=0.3,variational=True)

Applying weight drop of 0.3 to weight


Let's input data and see what happens

In [27]:
batch_size=2
seq_length = 5
data = torch.ones(size=(batch_size,2,in_features))

out = wdropout(data)

In [28]:
out[:,:,:]

tensor([[[5.2857, 5.2857],
         [5.2857, 5.2857]],

        [[5.2857, 5.2857],
         [5.2857, 5.2857]]], grad_fn=<SliceBackward>)

In [29]:
out[:,:,:]

tensor([[[5.2857, 5.2857],
         [5.2857, 5.2857]],

        [[5.2857, 5.2857],
         [5.2857, 5.2857]]], grad_fn=<SliceBackward>)

**ipad experimenting**

normal linear layer output

In [334]:
in_feat = 3
out_feat = 2
test_layer = nn.Linear(in_feat,out_feat)
print('ignore')
with torch.no_grad():
    test_layer.weight.copy_(torch.ones(out_feat,in_feat))
    test_layer.bias.copy_(torch.ones(out_feat))
print('ignore')

print('\nLayer parameters:')
for e in test_layer.parameters():
    e

print('\nDATA')
x = torch.ones(in_feat)
x

print('\nOutput')
test_layer(x)

ignore


Parameter containing:
tensor([[1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)

Parameter containing:
tensor([1., 1.], requires_grad=True)

ignore

Layer parameters:


Parameter containing:
tensor([[1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)

Parameter containing:
tensor([1., 1.], requires_grad=True)


DATA


tensor([1., 1., 1.])


Output:


tensor([4., 4.], grad_fn=<AddBackward0>)

Now turning off a weight, i.e. a whole column from M. 

In [427]:
# let's say the drop probability is 0.3
drop_prob = 0.3
keep_prob = 1-drop_prob
new_M = torch.tensor([0,1]).view(-1,1).expand_as(test_layer.weight) / keep_prob

with torch.no_grad():
    test_layer.weight.copy_(new_M)
    #don't mess with the bias i think

print('\nDATA')
x = torch.ones(in_feat)
x

print('\nOutput:')
test_layer(x)

Parameter containing:
tensor([[0.0000, 0.0000, 0.0000],
        [1.4286, 1.4286, 1.4286]], requires_grad=True)


DATA


tensor([1., 1., 1.])


Output:


tensor([1.0000, 5.2857], grad_fn=<AddBackward0>)

---

In [120]:
in_features = 5
out_features = 3
layer = nn.Linear(in_features,out_features)

batch_size=3
seq_length = 2
data = torch.rand(size=(batch_size,seq_length,in_features))

In [188]:
raw_w = getattr(layer,'weight')
raw_w

mask = torch.ones(raw_w.size(0),1)
mask
b = torch.nn.functional.dropout(mask,p=0.3,training=True)
b

Parameter containing:
tensor([[ 0.3896, -0.1608,  0.3731, -0.3409, -0.1235],
        [ 0.2571, -0.0441,  0.2779,  0.1124,  0.0486],
        [-0.3006,  0.0180, -0.2287,  0.3922,  0.1477]], requires_grad=True)

tensor([[1.],
        [1.],
        [1.]])

tensor([[1.4286],
        [1.4286],
        [1.4286]])

In [None]:
for name_w in self.weights:
            raw_w = getattr(self.module, name_w + '_raw')
            w = None

            if self.variational:
                #######################################################
                # Variational dropout (as proposed by Gal & Ghahramani)
                #######################################################
                mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1))
                if raw_w.is_cuda: mask = mask.cuda()
                mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True)
                w = mask.expand_as(raw_w) * raw_w
            else:
                #######################################################
                # DropConnect (as presented in the AWD paper)
                #######################################################
                w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
            setattr(self.module, name_w, w)

## Conclusions

This dropout module drops network weights, not inputs

# locked_dropout

In [50]:
from locked_dropout import LockedDropout

from torch import nn
import torch

In [58]:
in_features = 50
out_features = 3
layer = nn.Linear(in_features,out_features)

In [59]:
batch_size=3
seq_length = 5
data = torch.rand(size=(batch_size,seq_length,in_features))

In [60]:
out = layer(data)
out.size()

torch.Size([3, 5, 3])

In [67]:
dropout = LockedDropout()

out_dropped = dropout(out,dropout=0.5)

In [74]:
out_dropped[0,3,:]

tensor([0.9252, -0.0000, 0.6426], grad_fn=<SliceBackward>)