In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import OneCycleLR
from torchsummary import summary
from tqdm import tqdm
import numpy as np

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

class Cutout(object):
    """Zero out randomly placed square patches of an image tensor.

    Args:
        n_holes: Number of patches to erase per call.
        length: Nominal side length of each patch. Each patch extends
            ``length // 2`` pixels either side of its centre and is clipped
            at the image border, so it can be smaller than ``length``.
    """

    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        """Return ``img`` multiplied by a 0/1 mask with ``n_holes`` erased patches.

        Args:
            img: Tensor whose dimensions 1 and 2 are treated as height and
                width; the same mask is broadcast over dimension 0.

        Returns:
            A new tensor; ``img`` itself is not modified.
        """
        h, w = img.size(1), img.size(2)
        # The mask stays float32 (as before) so dtype promotion of the result
        # is unchanged; it is created on img's device so the product is valid
        # for non-CPU tensors too.
        mask = torch.ones((h, w), dtype=torch.float32, device=img.device)
        half = self.length // 2
        for _ in range(self.n_holes):
            # Draw the centre from torch's RNG rather than NumPy's global one:
            # torch.manual_seed() then makes the augmentation reproducible.
            y = int(torch.randint(h, (1,)).item())
            x = int(torch.randint(w, (1,)).item())
            y1, y2 = max(y - half, 0), min(y + half, h)
            x1, x2 = max(x - half, 0), min(x + half, w)
            mask[y1:y2, x1:x2] = 0.
        return img * mask.expand_as(img)

# Seed PyTorch's global RNG before the loaders are built.
torch.manual_seed(1)
batch_size = 128

# Training pipeline: random affine jitter on the raw image, conversion to a
# tensor, normalisation, then Cutout on the normalised tensor.
train_transforms = transforms.Compose([
    # NOTE(review): RandomAffine runs before ToTensor/Normalize, so `fill` is
    # applied to the un-normalised image; 0.1307 equals the Normalize mean
    # below and may have been meant for a different scale -- confirm.
    transforms.RandomAffine(degrees=7, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=10, fill=0.1307),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
    Cutout(n_holes=1, length=16)
])
# Evaluation pipeline: no augmentation, same normalisation constants.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Worker processes and pinned memory are only requested when CUDA is in use.
kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=train_transforms),
    batch_size=batch_size, shuffle=True, **kwargs)
# Evaluation only sums the loss and the correct-prediction count over the whole
# split, so sample order cannot change the result; shuffling is unnecessary.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=test_transforms),
    batch_size=batch_size, shuffle=False, **kwargs)

class Net(nn.Module):
    """Small convolutional classifier that emits 10 log-probabilities.

    Layout: two 3x3 conv units (1 -> 16 -> 32 channels), a max-pool plus 1x1
    conv transition down to 16 channels, two more 3x3 conv units
    (16 -> 32 -> 32 channels), global average pooling, and a bias-free 1x1
    conv onto 10 outputs.
    """

    @staticmethod
    def _conv_unit(c_in, c_out, kernel, pad=0, drop=None):
        """Return [Conv2d(bias=False), BatchNorm2d, ReLU] plus Dropout if ``drop`` is set."""
        layers = [
            nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=kernel, padding=pad, bias=False),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
        ]
        if drop is not None:
            layers.append(nn.Dropout(drop))
        return layers

    def __init__(self):
        super().__init__()
        unit = self._conv_unit
        # Layers are created in the same order, and land at the same indices
        # inside each Sequential, as a hand-written listing would give.
        self.convblock1 = nn.Sequential(
            *unit(1, 16, 3, pad=1, drop=0.1),
            *unit(16, 32, 3, pad=1, drop=0.1),
        )
        self.transblock1 = nn.Sequential(
            nn.MaxPool2d(2, 2),
            *unit(32, 16, 1),
        )
        self.convblock2 = nn.Sequential(
            *unit(16, 32, 3, pad=1, drop=0.1),
            *unit(32, 32, 3, pad=1, drop=0.1),
        )
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.output_conv = nn.Conv2d(in_channels=32, out_channels=10, kernel_size=1, bias=False)

    def forward(self, x):
        """Return log-softmax scores reshaped to ``(-1, 10)``."""
        features = self.convblock2(self.transblock1(self.convblock1(x)))
        scores = self.output_conv(self.gap(features))
        return F.log_softmax(scores.view(-1, 10), dim=-1)

# Build the network on the selected device and print its layer table for a
# single-channel 28x28 input.
model = Net()
model = model.to(device)
summary(model, input_size=(1, 28, 28))

def train(model, device, train_loader, optimizer, scheduler):
    """Train ``model`` for one pass over ``train_loader``.

    The scheduler is stepped once per batch, right after the optimizer. The
    tqdm bar shows the last batch loss, the learning rate used for that batch
    and the running training accuracy.

    Args:
        model: Network to optimise; switched to training mode.
        device: Device that each batch is moved to.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Per-batch LR scheduler exposing ``step()`` and
            ``get_last_lr()``.

    Returns:
        None.
    """
    model.train()
    pbar = tqdm(train_loader)
    correct, processed = 0, 0
    for data, target in pbar:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        # Read the LR *before* stepping the scheduler so the bar reports the
        # rate this update actually uses. Reading it afterwards shows the next
        # step's rate, which after the final batch of a OneCycleLR run lies
        # past the schedule end and can display as a negative number.
        current_lr = scheduler.get_last_lr()[0]
        optimizer.step()
        scheduler.step()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)
        pbar.set_description(desc=f'Loss={loss.item():.4f} LR={current_lr:.6f} Acc={100*correct/processed:0.2f}')

def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print a one-line summary.

    The summed NLL loss is divided by ``len(test_loader.dataset)`` and printed
    together with the count and percentage of correct predictions. Gradients
    are disabled and the model is put in eval mode. Returns None.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Sum (not mean) per batch so the final division gives a true
            # per-sample average even when the last batch is smaller.
            loss_sum += F.nll_loss(log_probs, labels, reduction='sum').item()
            guesses = log_probs.argmax(dim=1, keepdim=True)
            hits += guesses.eq(labels.view_as(guesses)).sum().item()
    total = len(test_loader.dataset)
    avg_loss = loss_sum / total
    print(f'\nTest set: Avg loss: {avg_loss:.4f}, Accuracy: {hits}/{total} ({100.*hits/total:.2f}%)\n')

EPOCHS = 20

# Fresh network and optimizer for the actual training run.
model = Net()
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)
# The schedule is sized for one scheduler step per training batch.
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.02,
    epochs=EPOCHS,
    steps_per_epoch=len(train_loader),
    anneal_strategy='linear',
)

# One training pass followed by one evaluation pass per epoch (1-based label).
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}")
    train(model, device, train_loader, optimizer, scheduler)
    test(model, device, test_loader)

Using device: cuda


100%|██████████| 9.91M/9.91M [00:00<00:00, 16.9MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 456kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 3.61MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 7.76MB/s]


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 28, 28]             144
       BatchNorm2d-2           [-1, 16, 28, 28]              32
              ReLU-3           [-1, 16, 28, 28]               0
           Dropout-4           [-1, 16, 28, 28]               0
            Conv2d-5           [-1, 32, 28, 28]           4,608
       BatchNorm2d-6           [-1, 32, 28, 28]              64
              ReLU-7           [-1, 32, 28, 28]               0
           Dropout-8           [-1, 32, 28, 28]               0
         MaxPool2d-9           [-1, 32, 14, 14]               0
           Conv2d-10           [-1, 16, 14, 14]             512
      BatchNorm2d-11           [-1, 16, 14, 14]              32
             ReLU-12           [-1, 16, 14, 14]               0
           Conv2d-13           [-1, 32, 14, 14]           4,608
      BatchNorm2d-14           [-1, 32,

Loss=0.6672 LR=0.004001 Acc=57.41: 100%|██████████| 469/469 [00:23<00:00, 19.66it/s]



Test set: Avg loss: 0.4810, Accuracy: 8524/10000 (85.24%)

Epoch 2


Loss=0.5425 LR=0.007202 Acc=80.06: 100%|██████████| 469/469 [00:23<00:00, 20.13it/s]



Test set: Avg loss: 0.1333, Accuracy: 9566/10000 (95.66%)

Epoch 3


Loss=0.2821 LR=0.010403 Acc=83.68: 100%|██████████| 469/469 [00:23<00:00, 19.87it/s]



Test set: Avg loss: 0.1025, Accuracy: 9676/10000 (96.76%)

Epoch 4


Loss=0.4846 LR=0.013605 Acc=84.72: 100%|██████████| 469/469 [00:23<00:00, 19.81it/s]



Test set: Avg loss: 0.1138, Accuracy: 9651/10000 (96.51%)

Epoch 5


Loss=0.6716 LR=0.016806 Acc=85.64: 100%|██████████| 469/469 [00:23<00:00, 19.64it/s]



Test set: Avg loss: 0.1228, Accuracy: 9611/10000 (96.11%)

Epoch 6


Loss=0.5944 LR=0.019997 Acc=86.12: 100%|██████████| 469/469 [00:23<00:00, 19.92it/s]



Test set: Avg loss: 0.0554, Accuracy: 9809/10000 (98.09%)

Epoch 7


Loss=0.3755 LR=0.018568 Acc=86.56: 100%|██████████| 469/469 [00:24<00:00, 18.96it/s]



Test set: Avg loss: 0.0822, Accuracy: 9746/10000 (97.46%)

Epoch 8


Loss=0.2837 LR=0.017140 Acc=87.62: 100%|██████████| 469/469 [00:24<00:00, 19.14it/s]



Test set: Avg loss: 0.0520, Accuracy: 9816/10000 (98.16%)

Epoch 9


Loss=0.2989 LR=0.015711 Acc=87.85: 100%|██████████| 469/469 [00:23<00:00, 19.75it/s]



Test set: Avg loss: 0.0510, Accuracy: 9835/10000 (98.35%)

Epoch 10


Loss=0.3472 LR=0.014283 Acc=88.32: 100%|██████████| 469/469 [00:25<00:00, 18.63it/s]



Test set: Avg loss: 0.0444, Accuracy: 9852/10000 (98.52%)

Epoch 11


Loss=0.3519 LR=0.012854 Acc=88.61: 100%|██████████| 469/469 [00:24<00:00, 19.45it/s]



Test set: Avg loss: 0.0438, Accuracy: 9857/10000 (98.57%)

Epoch 12


Loss=0.2176 LR=0.011426 Acc=88.72: 100%|██████████| 469/469 [00:24<00:00, 19.25it/s]



Test set: Avg loss: 0.0344, Accuracy: 9896/10000 (98.96%)

Epoch 13


Loss=0.2429 LR=0.009997 Acc=89.02: 100%|██████████| 469/469 [00:24<00:00, 19.32it/s]



Test set: Avg loss: 0.0337, Accuracy: 9890/10000 (98.90%)

Epoch 14


Loss=0.3109 LR=0.008568 Acc=89.41: 100%|██████████| 469/469 [00:24<00:00, 19.09it/s]



Test set: Avg loss: 0.0382, Accuracy: 9867/10000 (98.67%)

Epoch 15


Loss=0.2743 LR=0.007140 Acc=89.40: 100%|██████████| 469/469 [00:24<00:00, 18.95it/s]



Test set: Avg loss: 0.0372, Accuracy: 9869/10000 (98.69%)

Epoch 16


Loss=0.3844 LR=0.005711 Acc=89.65: 100%|██████████| 469/469 [00:24<00:00, 18.90it/s]



Test set: Avg loss: 0.0344, Accuracy: 9884/10000 (98.84%)

Epoch 17


Loss=0.2142 LR=0.004283 Acc=90.27: 100%|██████████| 469/469 [00:24<00:00, 18.94it/s]



Test set: Avg loss: 0.0270, Accuracy: 9908/10000 (99.08%)

Epoch 18


Loss=0.3137 LR=0.002854 Acc=90.42: 100%|██████████| 469/469 [00:24<00:00, 19.04it/s]



Test set: Avg loss: 0.0267, Accuracy: 9910/10000 (99.10%)

Epoch 19


Loss=0.3615 LR=0.001426 Acc=90.82: 100%|██████████| 469/469 [00:24<00:00, 18.96it/s]



Test set: Avg loss: 0.0222, Accuracy: 9921/10000 (99.21%)

Epoch 20


Loss=0.2882 LR=-0.000003 Acc=90.94: 100%|██████████| 469/469 [00:25<00:00, 18.15it/s]



Test set: Avg loss: 0.0216, Accuracy: 9921/10000 (99.21%)



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import OneCycleLR
from torchsummary import summary
from tqdm import tqdm
import numpy as np

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

class Cutout(object):
    """Zero out randomly placed square patches of an image tensor.

    Args:
        n_holes: Number of patches to erase per call.
        length: Nominal side length of each patch. Each patch extends
            ``length // 2`` pixels either side of its centre and is clipped
            at the image border, so it can be smaller than ``length``.
    """

    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        """Return ``img`` multiplied by a 0/1 mask with ``n_holes`` erased patches.

        Args:
            img: Tensor whose dimensions 1 and 2 are treated as height and
                width; the same mask is broadcast over dimension 0.

        Returns:
            A new tensor; ``img`` itself is not modified.
        """
        h, w = img.size(1), img.size(2)
        # The mask stays float32 (as before) so dtype promotion of the result
        # is unchanged; it is created on img's device so the product is valid
        # for non-CPU tensors too.
        mask = torch.ones((h, w), dtype=torch.float32, device=img.device)
        half = self.length // 2
        for _ in range(self.n_holes):
            # Draw the centre from torch's RNG rather than NumPy's global one:
            # torch.manual_seed() then makes the augmentation reproducible.
            y = int(torch.randint(h, (1,)).item())
            x = int(torch.randint(w, (1,)).item())
            y1, y2 = max(y - half, 0), min(y + half, h)
            x1, x2 = max(x - half, 0), min(x + half, w)
            mask[y1:y2, x1:x2] = 0.
        return img * mask.expand_as(img)

# Seed PyTorch's global RNG before the loaders are built.
torch.manual_seed(1)
batch_size = 128

# Training pipeline: random affine jitter on the raw image, conversion to a
# tensor, normalisation, then Cutout on the normalised tensor.
train_transforms = transforms.Compose([
    # NOTE(review): RandomAffine runs before ToTensor/Normalize, so `fill` is
    # applied to the un-normalised image; 0.1307 equals the Normalize mean
    # below and may have been meant for a different scale -- confirm.
    transforms.RandomAffine(degrees=7, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=10, fill=0.1307),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
    Cutout(n_holes=1, length=16)
])
# Evaluation pipeline: no augmentation, same normalisation constants.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Worker processes and pinned memory are only requested when CUDA is in use.
kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=train_transforms),
    batch_size=batch_size, shuffle=True, **kwargs)
# Evaluation only sums the loss and the correct-prediction count over the whole
# split, so sample order cannot change the result; shuffling is unnecessary.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=test_transforms),
    batch_size=batch_size, shuffle=False, **kwargs)

class Net(nn.Module):
    """Small convolutional classifier that emits 10 log-probabilities.

    Layout: two 3x3 conv units (1 -> 16 -> 32 channels), a max-pool plus 1x1
    conv transition down to 16 channels, two more 3x3 conv units
    (16 -> 32 -> 32 channels), global average pooling, and a bias-free 1x1
    conv onto 10 outputs.
    """

    @staticmethod
    def _conv_unit(c_in, c_out, kernel, pad=0, drop=None):
        """Return [Conv2d(bias=False), BatchNorm2d, ReLU] plus Dropout if ``drop`` is set."""
        layers = [
            nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=kernel, padding=pad, bias=False),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
        ]
        if drop is not None:
            layers.append(nn.Dropout(drop))
        return layers

    def __init__(self):
        super().__init__()
        unit = self._conv_unit
        # Layers are created in the same order, and land at the same indices
        # inside each Sequential, as a hand-written listing would give.
        self.convblock1 = nn.Sequential(
            *unit(1, 16, 3, pad=1, drop=0.1),
            *unit(16, 32, 3, pad=1, drop=0.1),
        )
        self.transblock1 = nn.Sequential(
            nn.MaxPool2d(2, 2),
            *unit(32, 16, 1),
        )
        self.convblock2 = nn.Sequential(
            *unit(16, 32, 3, pad=1, drop=0.1),
            *unit(32, 32, 3, pad=1, drop=0.1),
        )
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.output_conv = nn.Conv2d(in_channels=32, out_channels=10, kernel_size=1, bias=False)

    def forward(self, x):
        """Return log-softmax scores reshaped to ``(-1, 10)``."""
        features = self.convblock2(self.transblock1(self.convblock1(x)))
        scores = self.output_conv(self.gap(features))
        return F.log_softmax(scores.view(-1, 10), dim=-1)

# Build the network on the selected device and print its layer table for a
# single-channel 28x28 input.
model = Net()
model = model.to(device)
summary(model, input_size=(1, 28, 28))

def train(model, device, train_loader, optimizer, scheduler):
    """Train ``model`` for one pass over ``train_loader``.

    The scheduler is stepped once per batch, right after the optimizer. The
    tqdm bar shows the last batch loss, the learning rate used for that batch
    and the running training accuracy.

    Args:
        model: Network to optimise; switched to training mode.
        device: Device that each batch is moved to.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Per-batch LR scheduler exposing ``step()`` and
            ``get_last_lr()``.

    Returns:
        None.
    """
    model.train()
    pbar = tqdm(train_loader)
    correct, processed = 0, 0
    for data, target in pbar:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        # Read the LR *before* stepping the scheduler so the bar reports the
        # rate this update actually uses. Reading it afterwards shows the next
        # step's rate, which after the final batch of a OneCycleLR run lies
        # past the schedule end and can display as a negative number.
        current_lr = scheduler.get_last_lr()[0]
        optimizer.step()
        scheduler.step()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)
        pbar.set_description(desc=f'Loss={loss.item():.4f} LR={current_lr:.6f} Acc={100*correct/processed:0.2f}')

def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print a one-line summary.

    The summed NLL loss is divided by ``len(test_loader.dataset)`` and printed
    together with the count and percentage of correct predictions. Gradients
    are disabled and the model is put in eval mode. Returns None.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Sum (not mean) per batch so the final division gives a true
            # per-sample average even when the last batch is smaller.
            loss_sum += F.nll_loss(log_probs, labels, reduction='sum').item()
            guesses = log_probs.argmax(dim=1, keepdim=True)
            hits += guesses.eq(labels.view_as(guesses)).sum().item()
    total = len(test_loader.dataset)
    avg_loss = loss_sum / total
    print(f'\nTest set: Avg loss: {avg_loss:.4f}, Accuracy: {hits}/{total} ({100.*hits/total:.2f}%)\n')

EPOCHS = 20

# Fresh network and optimizer for the actual training run.
model = Net()
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.001)
# The schedule is sized for one scheduler step per training batch.
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.02,
    epochs=EPOCHS,
    steps_per_epoch=len(train_loader),
    anneal_strategy='linear',
)

# One training pass followed by one evaluation pass per epoch (1-based label).
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}")
    train(model, device, train_loader, optimizer, scheduler)
    test(model, device, test_loader)

Using device: cuda


100%|██████████| 9.91M/9.91M [00:01<00:00, 5.04MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 133kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.26MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 14.6MB/s]


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 28, 28]             144
       BatchNorm2d-2           [-1, 16, 28, 28]              32
              ReLU-3           [-1, 16, 28, 28]               0
           Dropout-4           [-1, 16, 28, 28]               0
            Conv2d-5           [-1, 32, 28, 28]           4,608
       BatchNorm2d-6           [-1, 32, 28, 28]              64
              ReLU-7           [-1, 32, 28, 28]               0
           Dropout-8           [-1, 32, 28, 28]               0
         MaxPool2d-9           [-1, 32, 14, 14]               0
           Conv2d-10           [-1, 16, 14, 14]             512
      BatchNorm2d-11           [-1, 16, 14, 14]              32
             ReLU-12           [-1, 16, 14, 14]               0
           Conv2d-13           [-1, 32, 14, 14]           4,608
      BatchNorm2d-14           [-1, 32,

Loss=0.6838 LR=0.004001 Acc=57.49: 100%|██████████| 469/469 [00:24<00:00, 19.42it/s]



Test set: Avg loss: 0.4262, Accuracy: 8670/10000 (86.70%)

Epoch 2


Loss=0.5616 LR=0.007202 Acc=80.14: 100%|██████████| 469/469 [00:23<00:00, 20.04it/s]



Test set: Avg loss: 0.1624, Accuracy: 9444/10000 (94.44%)

Epoch 3


Loss=0.2640 LR=0.010403 Acc=83.48: 100%|██████████| 469/469 [00:23<00:00, 19.96it/s]



Test set: Avg loss: 0.0934, Accuracy: 9713/10000 (97.13%)

Epoch 4


Loss=0.4887 LR=0.013605 Acc=84.62: 100%|██████████| 469/469 [00:23<00:00, 19.94it/s]



Test set: Avg loss: 0.0880, Accuracy: 9720/10000 (97.20%)

Epoch 5


Loss=0.6058 LR=0.016806 Acc=85.77: 100%|██████████| 469/469 [00:23<00:00, 20.07it/s]



Test set: Avg loss: 0.0619, Accuracy: 9816/10000 (98.16%)

Epoch 6


Loss=0.5585 LR=0.019997 Acc=86.23: 100%|██████████| 469/469 [00:24<00:00, 19.01it/s]



Test set: Avg loss: 0.0813, Accuracy: 9729/10000 (97.29%)

Epoch 7


Loss=0.3345 LR=0.018568 Acc=87.05: 100%|██████████| 469/469 [00:23<00:00, 19.56it/s]



Test set: Avg loss: 0.0557, Accuracy: 9816/10000 (98.16%)

Epoch 8


Loss=0.2733 LR=0.017140 Acc=88.03: 100%|██████████| 469/469 [00:23<00:00, 19.82it/s]



Test set: Avg loss: 0.0459, Accuracy: 9847/10000 (98.47%)

Epoch 9


Loss=0.2842 LR=0.015711 Acc=88.25: 100%|██████████| 469/469 [00:23<00:00, 19.67it/s]



Test set: Avg loss: 0.0385, Accuracy: 9874/10000 (98.74%)

Epoch 10


Loss=0.3386 LR=0.014283 Acc=88.67: 100%|██████████| 469/469 [00:23<00:00, 19.59it/s]



Test set: Avg loss: 0.0413, Accuracy: 9856/10000 (98.56%)

Epoch 11


Loss=0.3318 LR=0.012854 Acc=88.93: 100%|██████████| 469/469 [00:23<00:00, 20.10it/s]



Test set: Avg loss: 0.0342, Accuracy: 9890/10000 (98.90%)

Epoch 12


Loss=0.2687 LR=0.011426 Acc=89.13: 100%|██████████| 469/469 [00:24<00:00, 19.48it/s]



Test set: Avg loss: 0.0386, Accuracy: 9863/10000 (98.63%)

Epoch 13


Loss=0.2568 LR=0.009997 Acc=89.52: 100%|██████████| 469/469 [00:23<00:00, 20.10it/s]



Test set: Avg loss: 0.0297, Accuracy: 9896/10000 (98.96%)

Epoch 14


Loss=0.2615 LR=0.008568 Acc=89.73: 100%|██████████| 469/469 [00:22<00:00, 20.66it/s]



Test set: Avg loss: 0.0311, Accuracy: 9894/10000 (98.94%)

Epoch 15


Loss=0.3211 LR=0.007140 Acc=89.77: 100%|██████████| 469/469 [00:22<00:00, 20.52it/s]



Test set: Avg loss: 0.0279, Accuracy: 9913/10000 (99.13%)

Epoch 16


Loss=0.3581 LR=0.005711 Acc=90.25: 100%|██████████| 469/469 [00:22<00:00, 20.56it/s]



Test set: Avg loss: 0.0271, Accuracy: 9897/10000 (98.97%)

Epoch 17


Loss=0.1860 LR=0.004283 Acc=90.52: 100%|██████████| 469/469 [00:22<00:00, 20.44it/s]



Test set: Avg loss: 0.0219, Accuracy: 9924/10000 (99.24%)

Epoch 18


Loss=0.2676 LR=0.002854 Acc=90.69: 100%|██████████| 469/469 [00:24<00:00, 19.00it/s]



Test set: Avg loss: 0.0210, Accuracy: 9925/10000 (99.25%)

Epoch 19


Loss=0.3406 LR=0.001426 Acc=91.11: 100%|██████████| 469/469 [00:23<00:00, 19.95it/s]



Test set: Avg loss: 0.0188, Accuracy: 9940/10000 (99.40%)

Epoch 20


Loss=0.2850 LR=-0.000003 Acc=90.99: 100%|██████████| 469/469 [00:23<00:00, 19.91it/s]



Test set: Avg loss: 0.0190, Accuracy: 9936/10000 (99.36%)

