In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam, SGD

import torchvision.transforms as T
from torchvision.datasets import FashionMNIST

import time

from ftrl import FTRL, FTRLAdam


# --- Configuration ---------------------------------------------------------
DATA_ROOT = './data'      # directory the FashionMNIST files are downloaded to / read from
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 1024
NUM_WORKERS = 12          # loader worker processes, shared by both loaders

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The only preprocessing step is ToTensor (no normalisation or augmentation).
tfm = T.Compose([T.ToTensor()])

# Both splits share everything except the `train` flag.
dataset_kwargs = dict(root=DATA_ROOT, download=True, transform=tfm)
train_set = FashionMNIST(train=True, **dataset_kwargs)
test_set = FashionMNIST(train=False, **dataset_kwargs)

# Training batches are reshuffled every epoch; the test set is read in a
# fixed order with a much larger batch size.
train_loader = DataLoader(
    train_set,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
test_loader = DataLoader(
    test_set,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)


def train_and_test(model, optimizer, epochs=5):
    """Train ``model`` with ``optimizer`` and evaluate it after every epoch.

    Each epoch makes one pass over the module-level ``train_loader`` using
    cross-entropy loss, followed by one gradient-free pass over the
    module-level ``test_loader``. A one-line summary (accuracy, mean
    per-sample loss, wall-clock time) is printed per epoch.

    Args:
        model: network to optimise. Batches are moved to ``DEVICE`` here but
            the model is not, so it must already be on ``DEVICE``.
        optimizer: optimizer built over ``model.parameters()``.
        epochs: number of train + evaluate rounds to run.

    Returns:
        None. Results are reported via ``print`` only.
    """
    loss_function = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        start_time = time.time()
        train_num = 0
        train_right = 0
        train_loss = 0.0
        model.train()
        for x, y in train_loader:
            num = len(x)
            x, y = x.to(DEVICE), y.to(DEVICE)
            y_hat = model(x)
            loss = loss_function(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_num += num
            train_right += (y_hat.argmax(dim=1) == y).sum().item()
            # CrossEntropyLoss averages over the batch by default, so weight
            # it by the batch size; dividing by the sample count below then
            # yields a true per-sample mean that does not depend on batch size.
            train_loss += loss.item() * num
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        train_acc = train_right / max(train_num, 1)
        train_loss /= max(train_num, 1)

        with torch.no_grad():
            model.eval()
            test_num = 0
            test_right = 0
            test_loss = 0.0
            for x, y in test_loader:
                num = len(x)
                # Use DEVICE (not .cuda()) so evaluation also works on CPU-only hosts.
                x, y = x.to(DEVICE), y.to(DEVICE)
                y_hat = model(x)
                loss = loss_function(y_hat, y)
                test_num += num
                test_right += (y_hat.argmax(dim=1) == y).sum().item()
                # Same batch-size weighting as in the training loop.
                test_loss += loss.item() * num
            test_acc = test_right / max(test_num, 1)
            test_loss /= max(test_num, 1)
        print(f"epoch: {epoch}, train_acc: {train_acc * 100:.2f}%, train_loss: {train_loss:.2e}, test_acc: {test_acc * 100:.2f}%, test_loss: {test_loss:.2e}, time used: {time.time() - start_time:.3f}s")


def model_cnn():
    """Build a freshly initialised CNN classifier and move it to ``DEVICE``.

    Architecture: two blocks of (5x5 convolution with 16 output channels and
    "same" padding -> ReLU -> 2x2 max-pool), then flatten, a 1024-unit hidden
    linear layer with ReLU, and a final 10-way linear layer. The first linear
    layer takes ``16*7*7`` features, which corresponds to single-channel
    28x28 inputs after the two pooling steps.

    Returns:
        A new ``nn.Sequential`` on ``DEVICE``; every call creates independent
        parameters.
    """
    return nn.Sequential(
        nn.Conv2d(1, 16, 5, padding="same"),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Conv2d(16, 16, 5, padding="same"),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Flatten(),
        nn.Linear(16*7*7, 1024),
        nn.ReLU(),
        nn.Linear(1024, 10)
    ).to(DEVICE)


In [None]:
# Compare optimizers on the same architecture with identical hyper-parameters.
# Re-seeding before each run gives every optimizer the same initial weights,
# so differences in the logs come from the optimizer rather than from a lucky
# or unlucky initialisation.
# NOTE(review): some GPU kernels may still be non-deterministic, so runs are
# comparable but not guaranteed bit-identical.
SEED = 0
LEARNING_RATE = 1.0e-3
WEIGHT_DECAY = 1.0e-5

OPTIMIZERS = (
    ("SGD", SGD),
    ("Adam", Adam),
    ("FTRL", FTRL),
    ("FTRLAdam", FTRLAdam),
)

for optimizer_name, optimizer_cls in OPTIMIZERS:
    print(f"{optimizer_name}:")
    torch.manual_seed(SEED)
    model = model_cnn()
    optimizer = optimizer_cls(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    train_and_test(model, optimizer)


SGD:
epoch: 0, train_acc: 31.10%, train_loss: 6.95e-02, test_acc: 58.53%, test_loss: 1.93e-03, time used: 16.103s
epoch: 1, train_acc: 65.32%, train_loss: 3.40e-02, test_acc: 70.81%, test_loss: 8.11e-04, time used: 15.754s
epoch: 2, train_acc: 73.00%, train_loss: 2.33e-02, test_acc: 74.59%, test_loss: 7.11e-04, time used: 15.819s
epoch: 3, train_acc: 75.47%, train_loss: 2.12e-02, test_acc: 75.62%, test_loss: 6.76e-04, time used: 15.572s
epoch: 4, train_acc: 76.74%, train_loss: 2.00e-02, test_acc: 76.15%, test_loss: 6.44e-04, time used: 15.963s
Adam:
epoch: 0, train_acc: 84.39%, train_loss: 1.33e-02, test_acc: 87.30%, test_loss: 3.42e-04, time used: 15.981s
epoch: 1, train_acc: 89.68%, train_loss: 8.68e-03, test_acc: 90.01%, test_loss: 2.73e-04, time used: 16.443s
epoch: 2, train_acc: 91.27%, train_loss: 7.33e-03, test_acc: 90.55%, test_loss: 2.61e-04, time used: 15.610s
epoch: 3, train_acc: 92.22%, train_loss: 6.48e-03, test_acc: 90.97%, test_loss: 2.46e-04, time used: 17.129s
epoch: 4