In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam, SGD

import torchvision.transforms as T
from torchvision.datasets import FashionMNIST

import time

from ftrl import FTRL, FTRLAdam


DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tfm = T.Compose([
    T.ToTensor()
])
train_set = FashionMNIST(root='./data', train=True, download=True, transform=tfm)
test_set = FashionMNIST(root='./data', train=False, download=True, transform=tfm)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=12)
test_loader = DataLoader(test_set, batch_size=1024, shuffle=False, num_workers=12)


def train_and_test(model, optimizer, epochs=5):
    loss_function = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        start_time = time.time()
        train_num = 0
        train_right = 0
        train_loss = 0
        model.train()
        for x, y in train_loader:
            num = len(x)
            x, y = x.to(DEVICE), y.to(DEVICE)
            y_hat = model(x)
            loss = loss_function(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_num += num
            train_right += (y_hat.argmax(dim=1) == y).sum().item()
            train_loss += loss.item()
        train_acc = train_right / train_num
        train_loss /= train_num

        with torch.no_grad():
            model.eval()
            test_num = 0
            test_right = 0
            test_loss = 0
            for x, y in test_loader:
                num = len(x)
                x, y = x.cuda(), y.cuda()
                y_hat = model(x)
                loss = loss_function(y_hat, y)
                test_num += num
                test_right += (y_hat.argmax(dim=1) == y).sum().item()
                test_loss += loss.item()
            test_acc = test_right / test_num
            test_loss /= test_num
        print(f"epoch: {epoch}, train_acc: {train_acc * 100:.2f}%, train_loss: {train_loss:.2e}, test_acc: {test_acc * 100:.2f}%, test_loss: {test_loss:.2e}, time used: {time.time() - start_time:.3f}s")


model_cnn = lambda : nn.Sequential(
    nn.Conv2d(1, 16, 5, padding="same"),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(16, 16, 5, padding="same"),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Flatten(),
    nn.Linear(16*7*7, 1024),
    nn.ReLU(),
    nn.Linear(1024, 10)
).to(DEVICE)


In [3]:
print("SGD:")
model = model_cnn()
optimizer = SGD(model.parameters(), lr=1.0e-3, weight_decay=1.0e-5)
train_and_test(model, optimizer)
print("Adam:")
model = model_cnn()
optimizer = Adam(model.parameters(), lr=1.0e-3, weight_decay=1.0e-5)
train_and_test(model, optimizer)
print("FTRL:")
model = model_cnn()
optimizer = FTRL(model.parameters(), lr=1.0e-3, weight_decay=1.0e-5)
train_and_test(model, optimizer)
print("FTRLAdam:")
model = model_cnn()
optimizer = FTRLAdam(model.parameters(), lr=1.0e-3, weight_decay=1.0e-5)
train_and_test(model, optimizer)


SGD:
epoch: 0, train_acc: 33.74%, train_loss: 6.21e-02, test_acc: 61.65%, test_loss: 1.12e-03, time used: 14.568s
epoch: 1, train_acc: 68.23%, train_loss: 2.73e-02, test_acc: 71.87%, test_loss: 7.94e-04, time used: 14.622s
epoch: 2, train_acc: 72.92%, train_loss: 2.32e-02, test_acc: 73.63%, test_loss: 7.18e-04, time used: 14.505s
epoch: 3, train_acc: 75.14%, train_loss: 2.14e-02, test_acc: 75.72%, test_loss: 6.76e-04, time used: 14.484s
epoch: 4, train_acc: 76.72%, train_loss: 2.01e-02, test_acc: 77.32%, test_loss: 6.43e-04, time used: 14.517s
Adam:
epoch: 0, train_acc: 84.11%, train_loss: 1.36e-02, test_acc: 86.47%, test_loss: 3.68e-04, time used: 14.942s
epoch: 1, train_acc: 89.55%, train_loss: 8.83e-03, test_acc: 89.85%, test_loss: 2.75e-04, time used: 14.869s
epoch: 2, train_acc: 91.11%, train_loss: 7.53e-03, test_acc: 90.07%, test_loss: 2.67e-04, time used: 14.849s
epoch: 3, train_acc: 91.99%, train_loss: 6.61e-03, test_acc: 90.10%, test_loss: 2.58e-04, time used: 14.702s
epoch: 4