## Batch Normalization

### With Dropout

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings
# These module-level names are read by the data loaders, optimizer,
# train(), test() and the epoch loop further down in this cell.
args_batch_size = 64  # batch size used by train_loader
args_test_batch_size = 1000  # batch size used by test_loader
args_epochs = 10  # number of passes of the train/test loop
args_lr = 0.01  # SGD learning rate
args_momentum = 0.5  # SGD momentum
args_no_cuda = False  # set True to stay on the CPU even when CUDA exists
args_seed = 1  # seed passed to the torch RNG(s) below
args_log_interval = 100  # train() logs every this-many batches

# Use the GPU only when it is not disabled above and CUDA is available.
args_cuda = not args_no_cuda and torch.cuda.is_available()

# Seed the CPU RNG, and the CUDA RNG as well when the GPU is used.
torch.manual_seed(args_seed)
if args_cuda:
    torch.cuda.manual_seed(args_seed)


# Extra DataLoader options are only supplied when running on CUDA.
if args_cuda:
    kwargs = dict(num_workers=1, pin_memory=True)
else:
    kwargs = dict()

# Training split: downloaded into 'data_mnist' on first use, converted to
# tensors and normalised with mean 0.1307 / std 0.3081.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        'data_mnist',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])),
    batch_size=args_batch_size,
    shuffle=True,
    **kwargs)

# Held-out split, read from the same directory with the same preprocessing.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        'data_mnist',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])),
    batch_size=args_test_batch_size,
    shuffle=True,
    **kwargs)


class Net(nn.Module):
    """Two-conv / two-linear classifier with batch norm and dropout.

    Each convolution is followed by 2x2 max-pooling, batch normalisation
    and ReLU.  Channel dropout is applied to the second convolution's
    output and element dropout to the hidden fully connected layer, both
    with probability ``p``.  The network returns log-probabilities over
    10 classes.
    """

    def __init__(self,p):
        """Create the layers; ``p`` is the dropout probability."""
        super(Net, self).__init__()
        # Layers are created in the same order as before so that a fixed
        # seed yields the same parameter initialisation.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.batch1 = nn.BatchNorm2d(num_features=10)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=5)
        self.batch2 = nn.BatchNorm2d(num_features=20)
        self.conv2_drop = nn.Dropout2d(p=p)
        self.p = p
        self.fc1 = nn.Linear(in_features=320, out_features=50)
        self.batch3 = nn.BatchNorm1d(num_features=50)
        self.fc2 = nn.Linear(in_features=50, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities for the batch ``x``."""
        # Stage 1: conv -> pool -> batch norm -> ReLU.
        h = self.conv1(x)
        h = F.max_pool2d(h, 2)
        h = F.relu(self.batch1(h))

        # Stage 2: conv -> channel dropout -> pool -> batch norm -> ReLU.
        h = self.conv2(h)
        h = self.conv2_drop(h)
        h = F.max_pool2d(h, 2)
        h = F.relu(self.batch2(h))

        # Flatten to 320 features per sample (20 channels x 4 x 4 for a
        # 28x28 input).
        h = h.view(-1, 320)

        # Hidden layer with dropout that is only active in training mode.
        h = F.relu(self.batch3(self.fc1(h)))
        h = F.dropout(h, p=self.p, training=self.training)

        logits = self.fc2(h)
        return F.log_softmax(logits, dim=1)

# Build the dropout variant of the network (p = 0.25) and place it on the
# GPU when CUDA is enabled; Module.cuda() returns the module itself.
model = Net(p=0.25)
if args_cuda:
    model = model.cuda()

# SGD with momentum over every parameter of the model.
optimizer = optim.SGD(
    model.parameters(),
    lr=args_lr,
    momentum=args_momentum)

def train(epoch):
    """Run one training epoch over ``train_loader``.

    Puts the global ``model`` in training mode, performs one optimizer
    step per mini-batch using the NLL loss, and prints the current batch
    loss every ``args_log_interval`` batches.

    Args:
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args_cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args_log_interval == 0:
            # loss is a 0-dim tensor: indexing it with [0] raises on
            # current PyTorch, so read the Python float with .item().
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def _evaluate(loader):
    """Return ``(summed NLL loss, number of correct predictions)``.

    Runs the global ``model`` over every batch of ``loader`` with autograd
    disabled.  The caller is responsible for putting the model in eval
    mode first.
    """
    total_loss = 0.0
    correct = 0
    # no_grad() replaces the removed Variable(volatile=True) mechanism, so
    # no autograd graph is built during evaluation.
    with torch.no_grad():
        for data, target in loader:
            if args_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # Sum (not average) per batch; test() divides by dataset size.
            total_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the max log-probability is the predicted class.
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    return total_loss, correct


def test():
    """Evaluate the global ``model`` on the train and test loaders.

    Switches the model to eval mode, then prints the average NLL loss and
    the accuracy for ``train_loader`` followed by ``test_loader``.
    """
    model.eval()
    train_loss, train_correct = _evaluate(train_loader)
    test_loss, test_correct = _evaluate(test_loader)

    train_loss /= len(train_loader.dataset)
    test_loss /= len(test_loader.dataset)
    print('\nTrain set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        train_loss, train_correct, len(train_loader.dataset),
        100. * train_correct / len(train_loader.dataset)))
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, test_correct, len(test_loader.dataset),
        100. * test_correct / len(test_loader.dataset)))


# Train for args_epochs epochs (numbered from 1) and report train/test
# loss and accuracy after each one.
for epoch in range(1, args_epochs + 1):
    train(epoch)
    test()


Train set: Average loss: 0.1498, Accuracy: 58149/60000 (97%)
Test set: Average loss: 0.1395, Accuracy: 9721/10000 (97%)


Train set: Average loss: 0.0758, Accuracy: 58820/60000 (98%)
Test set: Average loss: 0.0717, Accuracy: 9811/10000 (98%)


Train set: Average loss: 0.0561, Accuracy: 59084/60000 (98%)
Test set: Average loss: 0.0530, Accuracy: 9837/10000 (98%)


Train set: Average loss: 0.0485, Accuracy: 59196/60000 (99%)
Test set: Average loss: 0.0475, Accuracy: 9855/10000 (99%)


Train set: Average loss: 0.0404, Accuracy: 59302/60000 (99%)
Test set: Average loss: 0.0400, Accuracy: 9883/10000 (99%)


Train set: Average loss: 0.0365, Accuracy: 59385/60000 (99%)
Test set: Average loss: 0.0378, Accuracy: 9886/10000 (99%)


Train set: Average loss: 0.0344, Accuracy: 59436/60000 (99%)
Test set: Average loss: 0.0362, Accuracy: 9892/10000 (99%)


Train set: Average loss: 0.0314, Accuracy: 59443/60000 (99%)
Test set: Average loss: 0.0325, Accuracy: 9894/10000 (99%)


Train set: Average loss

### Without Dropout

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings
# These module-level names are read by the data loaders, optimizer,
# train(), test() and the epoch loop further down in this cell.
args_batch_size = 64  # batch size used by train_loader
args_test_batch_size = 1000  # batch size used by test_loader
args_epochs = 10  # number of passes of the train/test loop
args_lr = 0.01  # SGD learning rate
args_momentum = 0.5  # SGD momentum
args_no_cuda = False  # set True to stay on the CPU even when CUDA exists
args_seed = 1  # seed passed to the torch RNG(s) below
args_log_interval = 100  # train() logs every this-many batches

# Use the GPU only when it is not disabled above and CUDA is available.
args_cuda = not args_no_cuda and torch.cuda.is_available()

# Seed the CPU RNG, and the CUDA RNG as well when the GPU is used.
torch.manual_seed(args_seed)
if args_cuda:
    torch.cuda.manual_seed(args_seed)


# Extra DataLoader options are only supplied when running on CUDA.
if args_cuda:
    kwargs = dict(num_workers=1, pin_memory=True)
else:
    kwargs = dict()

# Training split: downloaded into 'data_mnist' on first use, converted to
# tensors and normalised with mean 0.1307 / std 0.3081.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        'data_mnist',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])),
    batch_size=args_batch_size,
    shuffle=True,
    **kwargs)

# Held-out split, read from the same directory with the same preprocessing.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        'data_mnist',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])),
    batch_size=args_test_batch_size,
    shuffle=True,
    **kwargs)


class Net(nn.Module):
    """Two-conv / two-linear classifier with batch norm and no dropout.

    Each convolution is followed by 2x2 max-pooling, batch normalisation
    and ReLU; the hidden fully connected layer is batch-normalised as
    well.  The network returns log-probabilities over 10 classes.
    """

    def __init__(self):
        """Create the layers; this variant takes no dropout probability."""
        super(Net, self).__init__()
        # Layers are created in the same order as before so that a fixed
        # seed yields the same parameter initialisation.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.batch1 = nn.BatchNorm2d(num_features=10)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=5)
        self.batch2 = nn.BatchNorm2d(num_features=20)
        # Dropout layers are deliberately left out of this variant.
        self.fc1 = nn.Linear(in_features=320, out_features=50)
        self.batch3 = nn.BatchNorm1d(num_features=50)
        self.fc2 = nn.Linear(in_features=50, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities for the batch ``x``."""
        # Stage 1: conv -> pool -> batch norm -> ReLU.
        h = self.conv1(x)
        h = F.max_pool2d(h, 2)
        h = F.relu(self.batch1(h))

        # Stage 2: conv -> pool -> batch norm -> ReLU (no dropout here).
        h = self.conv2(h)
        h = F.max_pool2d(h, 2)
        h = F.relu(self.batch2(h))

        # Flatten to 320 features per sample (20 channels x 4 x 4 for a
        # 28x28 input).
        h = h.view(-1, 320)

        # Hidden layer, then the 10-way output layer.
        h = F.relu(self.batch3(self.fc1(h)))
        logits = self.fc2(h)
        return F.log_softmax(logits, dim=1)

# Build the no-dropout variant of the network and place it on the GPU
# when CUDA is enabled; Module.cuda() returns the module itself.
model = Net()
if args_cuda:
    model = model.cuda()

# SGD with momentum over every parameter of the model.
optimizer = optim.SGD(
    model.parameters(),
    lr=args_lr,
    momentum=args_momentum)

def train(epoch):
    """Run one training epoch over ``train_loader``.

    Puts the global ``model`` in training mode, performs one optimizer
    step per mini-batch using the NLL loss, and prints the current batch
    loss every ``args_log_interval`` batches.

    Args:
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args_cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args_log_interval == 0:
            # loss is a 0-dim tensor: indexing it with [0] raises on
            # current PyTorch, so read the Python float with .item().
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def _evaluate(loader):
    """Return ``(summed NLL loss, number of correct predictions)``.

    Runs the global ``model`` over every batch of ``loader`` with autograd
    disabled.  The caller is responsible for putting the model in eval
    mode first.
    """
    total_loss = 0.0
    correct = 0
    # no_grad() replaces the removed Variable(volatile=True) mechanism, so
    # no autograd graph is built during evaluation.
    with torch.no_grad():
        for data, target in loader:
            if args_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # Sum (not average) per batch; test() divides by dataset size.
            total_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the max log-probability is the predicted class.
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    return total_loss, correct


def test():
    """Evaluate the global ``model`` on the train and test loaders.

    Switches the model to eval mode, then prints the average NLL loss and
    the accuracy for ``train_loader`` followed by ``test_loader``.
    """
    model.eval()
    train_loss, train_correct = _evaluate(train_loader)
    test_loss, test_correct = _evaluate(test_loader)

    train_loss /= len(train_loader.dataset)
    test_loss /= len(test_loader.dataset)
    print('\nTrain set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        train_loss, train_correct, len(train_loader.dataset),
        100. * train_correct / len(train_loader.dataset)))
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, test_correct, len(test_loader.dataset),
        100. * test_correct / len(test_loader.dataset)))


# Train for args_epochs epochs (numbered from 1) and report train/test
# loss and accuracy after each one.
for epoch in range(1, args_epochs + 1):
    train(epoch)
    test()


Train set: Average loss: 0.1138, Accuracy: 58708/60000 (98%)
Test set: Average loss: 0.1098, Accuracy: 9787/10000 (98%)


Train set: Average loss: 0.0640, Accuracy: 59163/60000 (99%)
Test set: Average loss: 0.0620, Accuracy: 9860/10000 (99%)


Train set: Average loss: 0.0476, Accuracy: 59330/60000 (99%)
Test set: Average loss: 0.0504, Accuracy: 9864/10000 (99%)


Train set: Average loss: 0.0369, Accuracy: 59479/60000 (99%)
Test set: Average loss: 0.0419, Accuracy: 9883/10000 (99%)


Train set: Average loss: 0.0299, Accuracy: 59580/60000 (99%)
Test set: Average loss: 0.0353, Accuracy: 9899/10000 (99%)


Train set: Average loss: 0.0251, Accuracy: 59627/60000 (99%)
Test set: Average loss: 0.0312, Accuracy: 9918/10000 (99%)


Train set: Average loss: 0.0219, Accuracy: 59698/60000 (99%)
Test set: Average loss: 0.0286, Accuracy: 9919/10000 (99%)


Train set: Average loss: 0.0210, Accuracy: 59709/60000 (100%)
Test set: Average loss: 0.0314, Accuracy: 9906/10000 (99%)


Train set: Average los

## Report

* Using Batch Normalization with dropout:
  * Train performance: `59747/60000 (100%)`
  * Test performance: `9900/10000 (99%)`
* Using batch normalization has increased the performance. This is expected, as normalizing helps the network learn faster and reach better performance.
* Using Batch Normalization without dropout:
  * Train performance: `59520/60000 (99%)`
  * Test performance: `9903/10000 (99%)`
* In this case, with batch normalization, the model with dropout achieves better train performance than the model without dropout, whereas their test performance is similar.
* Batch normalization with a properly chosen dropout probability is expected to perform better. In this case, if we had used the probability that gives maximum performance, the model would have learnt better. (We took the probability with maximum performance among only 4 values; more should have been tried.)