In [None]:
import torch
import torch.nn as nn

# Start from a 3x5 leaf tensor filled with ones that records gradients.
x = torch.ones(3, 5, requires_grad=True)

# A freshly built module is in training mode, so dropout zeroes each entry with
# probability p and rescales the survivors by 1 / (1 - p): with p = 0.75 every
# kept 1.0 becomes 4.0.
dropout = nn.Dropout(p=0.75)
y = dropout(x)
y

tensor([[0., 0., 4., 0., 0.],
        [0., 4., 0., 4., 4.],
        [0., 0., 0., 4., 0.]], grad_fn=<MulBackward0>)

In [None]:
# Scalar loss: the L2 norm of each row of y (dim=1), summed over the rows.
l = torch.sum(torch.norm(y, p=2, dim=1))
l

tensor(14.9282, grad_fn=<SumBackward0>)

In [None]:
# Backpropagate the scalar loss l through the row norms and the dropout mask to x.
l.backward()
# Gradient stored on the leaf x: entries that dropout zeroed receive 0, and each
# kept entry receives 4 * y_ij / ||y_i||, where 4 = 1 / (1 - 0.75) is the dropout
# rescaling. A row with one survivor gives 4 * 4 / 4 = 4; the row with three
# survivors gives 4 * 4 / sqrt(48), about 2.3094.
x.grad

tensor([[0.0000, 0.0000, 4.0000, 0.0000, 0.0000],
        [0.0000, 2.3094, 0.0000, 2.3094, 2.3094],
        [0.0000, 0.0000, 0.0000, 4.0000, 0.0000]])

In [1]:
import torch
import torch.nn as nn

class Dummy(nn.Module):
    """Transparent wrapper that logs every forward call before delegating.

    The wrapped module is stored as ``self.m``. Each call to ``forward``
    prints the size and device of the incoming tensor, which shows what
    input a given forward call actually received and where it lives.
    """

    def __init__(self, m):
        """Register ``m`` as the sub-module that performs the real computation."""
        super().__init__()
        self.m = m

    def forward(self, x):
        """Print the size and device of ``x``, then return ``self.m(x)``."""
        size, device = x.size(), x.device
        print('Dummy.forward', size, device)
        output = self.m(x)
        return output

# Random input of shape (50, 10) and a 10 -> 5 linear layer wrapped in the
# logging Dummy module. The input is drawn before the layer is initialised.
x = torch.randn((50, 10))
model = Dummy(nn.Linear(in_features=10, out_features=5))

# Nothing has been moved yet, so this forward pass runs on the CPU.
print('On CPU')
y = model(x)

On CPU
Dummy.forward torch.Size([50, 10]) cpu


In [3]:
# Move the input batch and the model's parameters onto the current CUDA device.
x = x.cuda()
model = model.cuda()

# Plain single-device forward pass, now executed on the GPU.
print('On GPU w/o nn.DataParallel')
y = model(x)

On GPU w/o nn.DataParallel
Dummy.forward torch.Size([50, 10]) cuda:0


In [4]:
# Wrap the GPU-resident model. nn.DataParallel splits the input along dim 0 across
# the available GPUs and runs one replica of the module per device.
parallel_model = nn.DataParallel(module=model)

print('On GPU w/ nn.DataParallel')
# Each replica prints its own "Dummy.forward" line. A single line reporting the
# full batch presumably means only one GPU was visible when this ran.
y = parallel_model(x)

On GPU w/ nn.DataParallel
Dummy.forward torch.Size([50, 10]) cuda:0
