# Schedulers for beginners using Chinese MNIST

# Definition


Schedulers adjust the learning rate explicitly at each learning step. This is conveniently
achieved with the trainer's set_learning_rate method. We could adjust the rate downward after
every epoch (or even after every minibatch), e.g., dynamically in response to how optimization
is progressing.

# Prepare Data

In [None]:
import pandas as pd
import numpy
import cv2

from PIL import Image
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the image files; file names are appended to it when loading.
path = '../input/chinese-mnist/data/data/'

# Index table for the dataset; each row is later turned into one image file
# name plus its character label.
df = pd.read_csv('../input/chinese-mnist/chinese_mnist.csv')
df.head()  # last expression of the cell, so the notebook shows the first rows

In [None]:
# The distinct characters in the index table form the class vocabulary.
char_values = numpy.unique(df['character'].values)
# Two-way lookup between an integer class index and its character.
idx_to_character = dict(enumerate(char_values))
character_to_idx = {c: i for i, c in idx_to_character.items()}

In [None]:
# install d2l for visualization
!pip install --quiet d2l

In [None]:
from d2l import mxnet as d2l
from mxnet import autograd, gluon, init, lr_scheduler, np, npx
import mxnet as mx
from mxnet.gluon import nn
npx.set_np()

In [None]:
# Each row's first three columns (suite_id, sample_id, code) form the image
# file name, e.g. suite_id 1, sample_id 3, code 4 -> input_1_3_4.jpg
features = []
labels = []

# Walk the needed columns in lockstep instead of re-slicing the row with
# df.iloc[i] four times per iteration.
rows = zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2], df['character'])
for suite_id, sample_id, code, character in rows:
    image_path = f'{path}input_{suite_id}_{sample_id}_{code}.jpg'
    image_arr = cv2.imread(image_path)
    if image_arr is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising, so fail here and name the offending path.
        raise FileNotFoundError(f'could not read image: {image_path}')
    features.append(image_arr)
    labels.append(character_to_idx[character])

# Stack the per-image arrays and class indices into single arrays.
features = np.array(features)
labels = np.array(labels)

In [None]:
# shuffle features and labels together so each image keeps its label;
# random_state=0 fixes the seed, making the order reproducible across runs
features, labels = shuffle(features, labels, random_state=0)

In [None]:
# print one label and the image
# labels hold class indices, so map the first one back to its character
print('Label:', idx_to_character[int(labels[0])])

# convert the first sample to a uint8 NumPy array so PIL can build an image
# from it; as the cell's last expression, the notebook displays the result
Image.fromarray(features[0].asnumpy().astype('uint8'), 'RGB')

In [None]:
# normalize: divide every pixel by 255 (maps values into [0, 1], assuming
# 8-bit pixel data)
features = features / 255.

In [None]:
# move depth: relocate axis 3 to position 1 — presumably turning
# (N, H, W, C) into the (N, C, H, W) layout the Conv2D layers consume;
# TODO confirm
features = np.moveaxis(features, 3, 1)

In [None]:
# split between train and test: the first 12000 shuffled samples are used for
# training, everything after them is held out for evaluation
train_count = 12000
X_train, X_test = features[:train_count], features[train_count:]
y_train, y_test = labels[:train_count], labels[train_count:]

In [None]:
# Mini-batch size shared by the training and test loaders.
batch_size = 128

# Pair each sample with its label and serve the pairs in shuffled mini-batches.
dataset_train = gluon.data.ArrayDataset(X_train, y_train)
train_iter = gluon.data.DataLoader(
    dataset_train, batch_size=batch_size, shuffle=True, num_workers=2)

dataset_test = gluon.data.ArrayDataset(X_test, y_test)
test_iter = gluon.data.DataLoader(
    dataset_test, batch_size=batch_size, shuffle=True, num_workers=2)

# Model

In [None]:
# LeNet-style CNN: two convolution + max-pooling stages followed by three
# dense layers.
# The output layer needs one unit per class. Labels are indices into
# char_values, so the width is derived from that vocabulary instead of being
# hard-coded to 10 (which is too narrow when there are more than 10 characters).
num_classes = len(char_values)

net = nn.HybridSequential()
net.add(nn.Conv2D(channels=6, kernel_size=5, padding=2, activation='relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Conv2D(channels=16, kernel_size=5, activation='relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Dense(120, activation='relu'),
        nn.Dense(84, activation='relu'),
        nn.Dense(num_classes))
net.hybridize()

# Training without Scheduler

In [None]:
# Softmax cross-entropy loss, applied to the network outputs and the class labels.
loss = gluon.loss.SoftmaxCrossEntropyLoss()
# Compute device chosen by the d2l helper (per its name, a GPU when one is
# available — confirm the fallback behaviour in d2l).
device = d2l.try_gpu()

In [None]:
def train(net, train_iter, test_iter, num_epochs, loss, trainer, device):
    """Train ``net`` from freshly initialised weights and plot progress live.

    Args:
        net: Gluon network; its parameters are re-initialised (Xavier) on
            ``device`` before training starts.
        train_iter: Iterable of ``(X, y)`` training mini-batches.
        test_iter: Data iterator passed to ``d2l.evaluate_accuracy_gpu``
            after every epoch.
        num_epochs: Number of passes over ``train_iter``.
        loss: Callable ``loss(y_hat, y)`` returning the per-batch loss.
        trainer: Gluon trainer whose ``step`` applies the parameter update.
        device: Context the network and every batch are moved to.

    Prints the final train loss, train accuracy and test accuracy; returns
    nothing.
    """
    # Every call starts over from new weights, so runs are comparable.
    net.initialize(force_reinit=True, ctx=device, init=init.Xavier())
    animator = d2l.Animator(
        xlabel='epoch',
        xlim=[0, num_epochs],
        legend=['train loss', 'train acc', 'test acc'],
    )
    for epoch in range(num_epochs):
        # Running sums for this epoch: train_loss, train_acc, num_examples.
        metric = d2l.Accumulator(3)
        for i, (X, y) in enumerate(train_iter):
            X = X.as_in_ctx(device)
            y = y.as_in_ctx(device)
            with autograd.record():
                y_hat = net(X)
                batch_loss = loss(y_hat, y)
            batch_loss.backward()
            examples = X.shape[0]
            trainer.step(examples)
            metric.add(batch_loss.sum(), d2l.accuracy(y_hat, y), examples)
            # Averages over all examples seen so far in this epoch.
            seen = metric[2]
            train_loss, train_acc = metric[0] / seen, metric[1] / seen
            # Refresh the training curves every 50 mini-batches.
            if (i + 1) % 50 == 0:
                progress = epoch + i / len(train_iter)
                animator.add(progress, (train_loss, train_acc, None))
        test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'train loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')

In [None]:
# Baseline: constant learning rate of 0.3 for 30 epochs.
lr, num_epochs = 0.3, 30
# NOTE: train() re-initialises the network again when it starts.
net.initialize(force_reinit=True, ctx=device, init=init.Xavier())
# Plain SGD with a fixed learning rate (no scheduler attached).
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})

In [None]:
# Reference run without a scheduler, for comparison with the runs below.
train(net, train_iter, test_iter, num_epochs, loss, trainer, device)

# Training with Schedulers

## Square Root Scheduler

In [None]:
class SquareRootScheduler:
    """Learning-rate schedule that decays with the inverse square root.

    Calling the instance with an update count ``num_update`` returns
    ``lr * (num_update + 1) ** -0.5``, so update 0 yields ``lr`` itself.
    """

    def __init__(self, lr=0.1):
        # Rate returned for the very first update (num_update == 0).
        self.lr = lr

    def __call__(self, num_update):
        """Return the learning rate for update number ``num_update``."""
        decay = (num_update + 1.0) ** -0.5
        return self.lr * decay

In [None]:
# The learning rate of an existing trainer can also be changed by hand.
trainer.set_learning_rate(0.1)
print(f'learning rate is now {trainer.learning_rate:.2f}')

In [None]:
# Preview the schedule: the rate returned for t = 0 .. num_epochs - 1.
scheduler = SquareRootScheduler(lr=0.1)
d2l.plot(np.arange(num_epochs), [scheduler(t) for t in range(num_epochs)])

In [None]:
# Hand the scheduler to a new SGD trainer and train from scratch.
# NOTE(review): the preview plots the schedule against epoch indices, but the
# trainer presumably calls it with the number of parameter updates
# (mini-batches) — confirm against the MXNet lr_scheduler docs.
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'lr_scheduler': scheduler})
train(net, train_iter, test_iter, num_epochs, loss, trainer, device)

## Factor Scheduler

In [None]:
class FactorScheduler:
    """Geometric learning-rate decay with a lower bound.

    The rate for update ``num_update`` is
    ``max(stop_factor_lr, base_lr * factor ** (num_update + 1))``. The first
    update (``num_update == 0``) therefore already uses ``base_lr * factor``,
    which matches the sequence obtained by calling the scheduler once per
    update starting from zero.

    Args:
        factor: Multiplicative decay applied per update; expected in (0, 1].
        stop_factor_lr: Floor below which the rate is never reduced.
        base_lr: Rate the decay starts from.
    """

    def __init__(self, factor=1, stop_factor_lr=1e-7, base_lr=0.1):
        self.factor = factor
        self.stop_factor_lr = stop_factor_lr
        self.base_lr = base_lr

    def __call__(self, num_update):
        """Return the learning rate for update number ``num_update``."""
        # Compute the rate from num_update instead of shrinking base_lr on
        # every call. The result then does not depend on how often, or in
        # which order, the scheduler has been called before (e.g. plotting
        # the schedule no longer uses up the decay ahead of training).
        decayed = self.base_lr * self.factor ** (num_update + 1)
        return max(self.stop_factor_lr, decayed)

In [None]:
# Start at 2.0, multiply by 0.9 per step, never go below 0.01;
# preview the rates returned for t = 0 .. 49.
scheduler = FactorScheduler(factor=0.9, stop_factor_lr=1e-2, base_lr=2.0)
d2l.plot(np.arange(50), [scheduler(t) for t in range(50)])

In [None]:
# Train from scratch with the factor scheduler driving the SGD learning rate.
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'lr_scheduler': scheduler})
train(net, train_iter, test_iter, num_epochs, loss, trainer, device)

## Multi Factor Scheduler

In [None]:
# Built-in piecewise-constant schedule: starts at 0.5 and is multiplied by 0.5
# at each milestone in `step`.
# NOTE(review): the milestones are presumably counted in parameter updates
# (mini-batches), not epochs, once the scheduler is used by a trainer —
# confirm; the plot below evaluates it at t = 0 .. num_epochs - 1.
scheduler = lr_scheduler.MultiFactorScheduler(step=[15, 30], factor=0.5,
                                              base_lr=0.5)
d2l.plot(np.arange(num_epochs), [scheduler(t) for t in range(num_epochs)])

In [None]:
# Train from scratch with the multi-factor scheduler driving the SGD learning rate.
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'lr_scheduler': scheduler})
train(net, train_iter, test_iter, num_epochs, loss, trainer, device)

## Cosine Scheduler

In [None]:
# Built-in cosine schedule from base_lr=0.3 towards final_lr=0.01 over
# max_update=20 steps (presumably held at final_lr afterwards — confirm);
# the plot evaluates it at t = 0 .. num_epochs - 1.
scheduler = lr_scheduler.CosineScheduler(max_update=20, base_lr=0.3,
                                         final_lr=0.01)
d2l.plot(np.arange(num_epochs), [scheduler(t) for t in range(num_epochs)])

In [None]:
# Train from scratch with the cosine scheduler driving the SGD learning rate.
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'lr_scheduler': scheduler})
train(net, train_iter, test_iter, num_epochs, loss, trainer, device)