# CIFAR10を10分で学習

CIFAR10をColabのGPU環境で10分だけ学習し、テストデータで一番認識率が高くなるものを目指します。

## 初期設定

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.misc import toimage
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Activation, Flatten, Dropout, Add
from tensorflow.keras.layers import Conv2D, MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import mnist
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.datasets import cifar10

def plot_history(history):
    """Plot accuracy and loss curves recorded by a Keras training run.

    Args:
        history: the object returned by ``model.fit`` / ``model.fit_generator``.
            Only its ``history`` dict (metric name -> list of per-epoch
            values) is read.

    Two figures are shown, one for accuracy and one for loss. Validation
    curves are added only when the corresponding ``val_*`` entry exists.
    """
    logs = history.history

    # The accuracy entry is called 'acc' in older Keras (and for the custom
    # ``acc`` metric used in this notebook) but 'accuracy' in newer tf.keras.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'

    for key in (acc_key, 'loss'):
        val_key = 'val_' + key
        plt.plot(logs[key])
        legend = [key]
        # Runs without validation_data have no val_* series to plot.
        if val_key in logs:
            plt.plot(logs[val_key])
            legend.append(val_key)
        plt.legend(legend, loc='lower right')
        plt.show()

def test_cifar10(test_num=10):
    """Show a random run of consecutive test images with true and predicted labels.

    Reads the module-level ``X_test``, ``Y_test`` and ``model``. Displays
    ``test_num`` images side by side, then prints the true class indices
    followed by the indices predicted by ``model``.

    Args:
        test_num: number of consecutive test samples to show (default 10).
    """
    # randint's upper bound is exclusive; +1 makes the last possible window
    # selectable and keeps test_num == len(X_test) valid.
    start = np.random.randint(X_test.shape[0] - test_num + 1)
    sample_images = X_test[start:start + test_num]
    sample_labels = Y_test[start:start + test_num]

    # squeeze=False always returns a 2-D axes array, even for test_num == 1.
    _, axes = plt.subplots(ncols=test_num, figsize=(2 * test_num, 2),
                           squeeze=False)
    for ax, image in zip(axes[0], sample_images):
        ax.axis('off')
        # Images are stored as floats scaled by 1/255; convert back to bytes.
        ax.imshow((image * 255).astype(np.uint8), interpolation='nearest')

    print(np.argmax(sample_labels, axis=1))

    preds = model.predict(sample_images)
    print(np.argmax(preds, axis=1))
    
# Load the CIFAR-10 data set.
# x_* / y_* keep the raw arrays returned by Keras; X_* are the images cast to
# float32 and divided by 255, Y_* are the labels one-hot encoded into 10 classes.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
X_train = x_train.astype('float32')/255.0
X_test = x_test.astype('float32')/255.0
Y_train = to_categorical(y_train, 10)
Y_test = to_categorical(y_test, 10)

## ベースライン(LeNet)

LeNet のCNN  
LeNetは1998年にLeCun先生が作られた、CNNの直接の先祖となったネットワークです。

In [0]:
# LeNet-style baseline: two (5x5 conv -> ReLU -> 2x2 max-pool) stages,
# followed by a 500-unit dense layer and a 10-way softmax classifier.
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=SGD(momentum=0.9, nesterov=True),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=20)

In [0]:
plot_history(history)

In [0]:
test_cifar10()
# 0:airplane, 1:automobile, 2:bird, 3:cat, 4:deer, 5:dog, 6:frog, 7:horse, 8:ship, 9:truck

LeNet結果

4.1s/epoch  
150epoch/10min  
68%

## オプティマイザー変更

重みの変更手法をSGDより高速に学習するものを利用します。  

### Adam

よく使われているAdamを利用してみます。  

In [0]:
# Same LeNet-style network as the baseline, rebuilt from scratch so the
# optimizer comparison starts from freshly initialized weights.
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=5)

In [0]:
plot_history(history)

LeNet-Adam結果

4.1s/epoch  
30epoch程で上限  
70%

### AdaBound

Adam は学習速度が高いが汎化能力がSGDよりも悪くなると言われています。  
AdaBoundという改良手法が最近出て注目されているので試してみましょう。

https://qiita.com/Phoeboooo/items/f610affdcaaae0a28f34

https://github.com/CyberZHG/keras-adabound

importがkerasからになっていてエラーが出るため、tensorflow.kerasにしたものを以下にコピーしてあります。


In [0]:
import tensorflow.keras as keras
import tensorflow.keras.backend as K


class AdaBound(keras.optimizers.Optimizer):
    """AdaBound optimizer.

    Adam-style update whose per-parameter step size is clipped between a
    lower and an upper bound; both bounds are functions of the iteration
    count and of `final_lr` (see `get_updates`).

    # Arguments
        lr: float >= 0. Learning rate.
        final_lr: float >= 0. Final (SGD) learning rate.
        base_lr: float >= 0. Used for loading the optimizer. Do not set the argument manually.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        gamma: float, 0 < gamma < 1. Convergence speed of the bound functions.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
        amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm.
    # References
        - [Adaptive Gradient Methods with Dynamic Bound of Learning Rate]
          (https://openreview.net/forum?id=Bkg3g2R9FX)
    """

    def __init__(self, lr=0.001, final_lr=0.1, base_lr=None,
                 beta_1=0.9, beta_2=0.999, gamma=0.001,
                 epsilon=None, decay=0., amsgrad=False, **kwargs):
        super(AdaBound, self).__init__(**kwargs)
        # Hyperparameters are stored as backend variables, grouped under a
        # name scope named after the class.
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.final_lr = K.variable(final_lr, name='final_lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.gamma = K.variable(gamma, name='gamma')
            self.decay = K.variable(decay, name='decay')
        if epsilon is None:
            epsilon = K.epsilon()
        # base_lr keeps the learning rate given at construction time (plain
        # Python value); get_updates uses lr / base_lr to rescale final_lr.
        if base_lr is None:
            self.base_lr = lr
        else:
            self.base_lr = base_lr
        self.epsilon = epsilon
        self.initial_decay = decay
        self.amsgrad = amsgrad

    def get_updates(self, loss, params):
        """Build and return the list of update ops for one optimization step.

        # Arguments
            loss: loss tensor passed on to `get_gradients`.
            params: list of variables to update.
        # Returns
            The list stored in `self.updates`: the iteration counter
            increment followed by the moment and parameter updates.
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        # Optional inverse-time decay of the learning rate.
        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        # t is the 1-based step index; lr_t folds the bias-correction terms
        # for both moment estimates into the learning rate.
        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))
        # final_lr is scaled by the same factor as lr (lr / base_lr); both
        # bounds tend towards final_lr as t grows.
        final_lr = self.final_lr * lr / self.base_lr
        lower_bound = final_lr * (1.0 - 1.0 / (self.gamma * t + 1.0))
        upper_bound = final_lr * (1.0 + 1.0 / (self.gamma * t))

        # First (ms) and second (vs) moment accumulators, one per parameter.
        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            # Placeholders so the zip below and the weights list keep the
            # same layout when AMSGrad is disabled.
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            if self.amsgrad:
                # AMSGrad: use the running maximum of the second moment.
                vhat_t = K.maximum(vhat, v_t)
                step = lr_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                step = lr_t / (K.sqrt(v_t) + self.epsilon)
            # Clip the element-wise step size into [lower_bound, upper_bound]
            # before applying it to the first-moment estimate.
            p_t = p - K.minimum(K.maximum(step, lower_bound), upper_bound) * m_t

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return the hyperparameters merged into the base optimizer config."""
        config = {'lr': float(K.get_value(self.lr)),
                  'final_lr': float(K.get_value(self.final_lr)),
                  'base_lr': self.base_lr,
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'gamma': float(K.get_value(self.gamma)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon,
                  'amsgrad': self.amsgrad}
        base_config = super(AdaBound, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
      

In [0]:
# Fresh copy of the LeNet-style network, to be trained with AdaBound.
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=AdaBound(lr=1e-3, final_lr=0.1),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=5)

In [0]:
plot_history(history)

LeNet-AdaBound結果

4.1s/epoch  
30epoch程で上限  
70%

## Batch Normalization の利用

Batch Normalizationは各層の出力(アクティベーション)を、ミニバッチ毎に平均0・分散1に近い分布になるように正規化します。  
Batch Normalizationを利用することで高速に学習が進み認識率も高まります。

In [0]:
# LeNet-style network with a BatchNormalization layer inserted after every
# ReLU (both conv stages and the 500-unit dense layer).
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),
    BatchNormalization(),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=5)

In [0]:
plot_history(history)

LeNet-BN-Adam結果

5.1s/epoch
30epoch程で上限  
71%

学習データでは10エポック強でほぼ100%になっていることがわかります。

## 層を広くする

特徴を分類するニューロンを増やして、層を広く(wide)することで性能が上がることがあります。  
ここでは中間層のニューロンをすべて4倍にしてみます。

In [0]:
# Wide variant of the LeNet-style network: every hidden layer has 4x the
# units of the baseline (20 -> 80, 50 -> 200, 500 -> 2000).
model = Sequential([
    Conv2D(80, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(200, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(2000),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=5)

In [0]:
plot_history(history)

LeNet-4wide-Adam結果

17.3s/epoch  
30epoch程で上限  
73%

## 層を深くする

層を深くすることで表現力を上げることができます。  
基本的にディープラーニングでは深い大きなネットワークにすることで性能が上がってきました。  
しかし単に層を深くしただけではそれほど性能が上がらないこともわかります。

In [0]:
# Deep variant: three stages of two 5x5 conv + ReLU layers, each stage
# followed by 2x2 max-pooling, then the usual dense head.
model = Sequential([
    # Stage 1: 20 filters
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    Conv2D(20, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 2: 50 filters
    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 3: 50 filters
    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=20)

In [0]:
plot_history(history)

LeNet-2deep-Adam結果

12.2s/epoch  
50epoch/10min  
73%

## Conv2Dを軽量化する

5x5 は 3x3 + 3x3 で範囲が同じになり計算量を減らすことができます。  
さらに 3x3 は 3x1 + 1x3 で同じ範囲になると考えることができます。  

https://www.slideshare.net/ren4yu/deep-neural-network-79382352

In [0]:
# Each 5x5 convolution of the baseline is replaced by two stacked 3x3
# convolutions (each followed by ReLU).
model = Sequential([
    Conv2D(20, (3, 3), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    Conv2D(20, (3, 3), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(50, (3, 3), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=25)

In [0]:
plot_history(history)

LeNet-3x3+3x3-Adam結果

6.1s/epoch  
30epoch  
71%

In [0]:
# Each 3x3 convolution is further factorized into a 1x3 convolution followed
# by a 3x1 convolution, giving four conv + ReLU pairs per stage.
model = Sequential([
    # Stage 1: 20 filters
    Conv2D(20, (1, 3), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    Conv2D(20, (3, 1), padding='same'),
    Activation('relu'),
    Conv2D(20, (1, 3), padding='same'),
    Activation('relu'),
    Conv2D(20, (3, 1), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 2: 50 filters
    Conv2D(50, (1, 3), padding='same'),
    Activation('relu'),
    Conv2D(50, (3, 1), padding='same'),
    Activation('relu'),
    Conv2D(50, (1, 3), padding='same'),
    Activation('relu'),
    Conv2D(50, (3, 1), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=5)

In [0]:
plot_history(history)

LeNet-3x1+1x3-Adam結果

9.2s/epoch  
30epoch  
69%

## ResNet

残差ブロックと言う畳み込みなどの層をスキップした結合を持つことで、非常に深いネットワークでも学習が進むようにしたものです。

https://deepage.net/deep_learning/2016/11/30/resnet.html

Kerasでの実装はFunctional APIを使う必要があるため、他のサンプルとは違った書き方になります。  
ここでは@koshian2氏の実装を元に、わかりやすくするためにループを解いたものを使わせていただいています。

https://qiita.com/koshian2/items/343a55d59d8fdc112661

### 元の深い(24層)ResNet

In [0]:
def _resnet_block(X, filters):
    """Apply one residual block with `filters` channels and return its output.

    Main path: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN. The shortcut branch is
    the block input passed through its own BatchNormalization. Both branches
    are summed and passed through ReLU. Layers are created in the same order
    as in the hand-unrolled version of this network.
    """
    shortcut = BatchNormalization()(X)
    X = Conv2D(filters, (3, 3), padding="same")(X)
    X = BatchNormalization()(X)
    X = Activation("relu")(X)
    X = Conv2D(filters, (3, 3), padding="same")(X)
    X = BatchNormalization()(X)
    X = Add()([X, shortcut])
    return Activation("relu")(X)


# Named `inputs` (not `input`) so the Python builtin is not shadowed for the
# rest of the notebook session.
inputs = Input(shape=(32, 32, 3))

# Initial convolution with 16 filters.
X = Conv2D(16, (3, 3), padding="same")(inputs)

# ResNet blocks 1-3 (16 filters)
X = _resnet_block(X, 16)
X = _resnet_block(X, 16)
X = _resnet_block(X, 16)

# After every 3 blocks: average-pool, then a conv that doubles the filters to 32.
X = AveragePooling2D((2, 2))(X)
X = Conv2D(32, (3, 3), padding="same")(X)

# ResNet blocks 4-6 (32 filters)
X = _resnet_block(X, 32)
X = _resnet_block(X, 32)
X = _resnet_block(X, 32)

# After every 3 blocks: average-pool, then a conv that doubles the filters to 64.
X = AveragePooling2D((2, 2))(X)
X = Conv2D(64, (3, 3), padding="same")(X)

# ResNet blocks 7-9 (64 filters)
X = _resnet_block(X, 64)
X = _resnet_block(X, 64)
X = _resnet_block(X, 64)

# Classify with a fully connected softmax layer.
X = Flatten()(X)
y = Dense(10, activation="softmax")(X)
model = Model(inputs=inputs, outputs=y)


In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=2)

In [0]:
plot_history(history)

ResNet-9blocks 結果

28.5s/epoch  
20epoch/10min    
80%

### 参考用の浅い(12層)ResNet

In [0]:
def _resnet_block(X, filters):
    """Apply one residual block with `filters` channels and return its output.

    Main path: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN. The shortcut branch is
    the block input passed through its own BatchNormalization. Both branches
    are summed and passed through ReLU. Layers are created in the same order
    as in the hand-unrolled version of this network.
    """
    shortcut = BatchNormalization()(X)
    X = Conv2D(filters, (3, 3), padding="same")(X)
    X = BatchNormalization()(X)
    X = Activation("relu")(X)
    X = Conv2D(filters, (3, 3), padding="same")(X)
    X = BatchNormalization()(X)
    X = Add()([X, shortcut])
    return Activation("relu")(X)


# Named `inputs` (not `input`) so the Python builtin is not shadowed for the
# rest of the notebook session.
inputs = Input(shape=(32, 32, 3))

# Initial convolution with 16 filters.
X = Conv2D(16, (3, 3), padding="same")(inputs)

# ResNet blocks 1-2 (16 filters)
X = _resnet_block(X, 16)
X = _resnet_block(X, 16)

# After every 2 blocks: average-pool, then a conv that doubles the filters to 32.
X = AveragePooling2D((2, 2))(X)
X = Conv2D(32, (3, 3), padding="same")(X)

# ResNet blocks 3-4 (32 filters)
X = _resnet_block(X, 32)
X = _resnet_block(X, 32)

# Classify with a fully connected softmax layer.
X = Flatten()(X)
y = Dense(10, activation="softmax")(X)
model = Model(inputs=inputs, outputs=y)


In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=4)

ResNet-4blocks 結果

26.3s/epoch  
35epoch/10min    
68%

浅いとResNetでも性能が出ないことがわかります。

## バッチサイズを小さくする

バッチサイズが大きいほうが計算速度が速くなります。  
しかしバッチサイズが大きすぎると汎化能力が落ちると言われています。  

https://tech.nikkeibp.co.jp/dm/atcl/mag/15/00144/00002/

In [0]:
# Fresh copy of the LeNet-style network for the small-batch experiment.
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=128, epochs=5)

In [0]:
plot_history(history)

LeNet-128batch-Adam結果

8.2s/epoch  
20epoch程で上限  
70%

## Dropout の利用

確率的にニューロンを動かなくさせる（なかったことにする）ことで汎化能力があがる手法です。  
そのかわり学習速度は遅くなります。

In [0]:
# LeNet-style network with Dropout: rate 0.2 after each pooling layer and
# rate 0.5 after the 500-unit dense layer.
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),

    Flatten(),
    Dense(500),
    Activation('relu'),
    Dropout(0.5),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=100)

In [0]:
plot_history(history)

LeNet-Dropout-Adam結果

5.1s/epoch  
110epoch/10min  
76%

## データ拡張

ネットワーク自体ではなく、学習させるデータを工夫することで認識率や汎化能力を上げることができます。

### ImageDataGeneratorによるデータ拡張
データを少しずらしたり回転したり拡縮したりなどで少しずつ違うデータで学習することで汎化能力を上げることができます。  
Kerasでは画像のデータ拡張のためのImageDataGeneratorが準備されています。

http://aidiary.hatenablog.com/entry/20161212/1481549365

In [0]:
# Fresh copy of the LeNet-style network for the data-augmentation experiment.
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
# This will do preprocessing and realtime data augmentation:
datagen = ImageDataGenerator(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    zca_epsilon=1e-06,  # epsilon for ZCA whitening
    rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
    # randomly shift images horizontally (fraction of total width)
    width_shift_range=0.1,
    # randomly shift images vertically (fraction of total height)
    height_shift_range=0.1,
    shear_range=0.05,  # set range for random shear
    zoom_range=0.1,  # set range for random zoom
    channel_shift_range=0.,  # set range for random channel shifts
    # set mode for filling points outside the input boundaries
    fill_mode='nearest',
    cval=0.,  # value used for fill_mode = "constant"
    horizontal_flip=True,  # randomly flip images
    vertical_flip=False,  # randomly flip images
    # set rescaling factor (applied before any other transformation)
    rescale=None,
    # set function that will be applied on each input
    preprocessing_function=None,
    # image data format, either "channels_first" or "channels_last"
    data_format=None,
    # fraction of images reserved for validation (strictly between 0 and 1)
    validation_split=0.0)

# Compute quantities required for feature-wise normalization
# (std, mean, and principal components if ZCA whitening is applied).
datagen.fit(X_train)


In [0]:
# Keep the returned History: the next cell calls plot_history(history), which
# would otherwise plot the stale history left over from an earlier experiment.
history = model.fit_generator(datagen.flow(X_train, Y_train,
                                           batch_size=1024),
                              epochs=25,
                              validation_data=(X_test, Y_test))

In [0]:
plot_history(history)

LeNet-DataAugmentation-Adam結果

23.5s/epoch  
25epoch/10min  
72%

### Mixup / BC-Learning

複数のデータを透過合成してデータ拡張をする手法です。

https://qiita.com/yu4u/items/70aa007346ec73b7ff05

https://qiita.com/koshian2/items/909360f50e3dd5922f32

ここでは@koshian2氏のBC-Learningの実装を使わせていただいています。

https://qiita.com/koshian2/items/d0661842eb66a7c0c0f3


In [0]:
def normal_generator(x, y, batch_size):
    """Yield shuffled ``(x_batch, y_batch)`` pairs forever.

    A new random permutation of the sample indices is drawn for every pass
    over the data. Only full batches are produced: the trailing
    ``x.shape[0] % batch_size`` samples of each permutation are dropped.
    """
    while True:
        n_samples = x.shape[0]
        order = np.random.permutation(n_samples)
        full_batches = n_samples // batch_size
        for start in range(0, full_batches * batch_size, batch_size):
            picked = order[start:start + batch_size]
            yield x[picked], y[picked]

def acc(y_true, y_pred):
    """Per-sample accuracy metric that also works with soft (mixed) labels.

    Compares the arg-max class of ``y_true`` with the arg-max class of
    ``y_pred`` along the last axis and returns the matches cast to float.
    """
    is_match = K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1))
    return K.cast(is_match, "float")

def bclearning_generator(base_generator, batch_size, sample_steps, n_steps):
    """Yield between-class (BC-learning) mixed batches forever.

    Args:
        base_generator: iterator yielding ``(images, onehots)`` batches.
            Batches whose length differs from ``sample_steps`` are skipped.
        batch_size: number of mixed samples per yielded batch; must be a
            positive multiple of ``sample_steps``.
        sample_steps: size of the base batches that samples are drawn from.
        n_steps: length of the inner step loop (the generator itself never
            terminates).

    Yields:
        ``(x_batch, y_batch)`` float32 arrays with ``batch_size`` rows. Each
        row is ``r * primary + (1 - r) * partner`` with ``r`` in [0.5, 1],
        applied to both the image and its one-hot label. The partner is a
        random sample of the same base batch with a different label.
    """
    assert batch_size >= sample_steps
    assert batch_size % sample_steps == 0
    repeats = batch_size // sample_steps
    while True:
        for _ in range(n_steps):
            # Pull base batches until one has exactly sample_steps items.
            while True:
                current_images, current_onehots = next(base_generator)
                if current_images.shape[0] == sample_steps and current_onehots.shape[0] == sample_steps:
                    break
            # Recover integer class ids from the one-hot rows.
            current_labels = np.sum(np.arange(current_onehots.shape[1]) * current_onehots, axis=-1)
            x_cache, y_cache = [], []
            for _ in range(repeats):
                for k in range(sample_steps):
                    diff_indices = np.where(current_labels != current_labels[k])[0]
                    if diff_indices.size == 0:
                        # The whole base batch has a single class, so there is
                        # no other-class partner. Fall back to any sample
                        # instead of letting np.random.choice raise on an
                        # empty array.
                        diff_indices = np.arange(sample_steps)
                    mix_ind = np.random.choice(diff_indices)
                    rnd = np.random.rand()
                    # Keep the primary sample dominant (weight >= 0.5).
                    if rnd < 0.5:
                        rnd = 1.0 - rnd
                    mix_img = rnd * current_images[k] + (1.0-rnd) * current_images[mix_ind]
                    mix_onehot = rnd * current_onehots[k] + (1.0-rnd) * current_onehots[mix_ind]
                    x_cache.append(mix_img)
                    y_cache.append(mix_onehot)
            x_batch = np.asarray(x_cache, dtype=np.float32)
            y_batch = np.asarray(y_cache, dtype=np.float32)
            yield x_batch, y_batch


In [0]:
# Fresh copy of the LeNet-style network for the BC-Learning experiment.
model = Sequential([
    Conv2D(20, (5, 5), padding='same', input_shape=(32, 32, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(50, (5, 5), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(500),
    Activation('relu'),

    Dense(10),
    Activation('softmax'),
])

In [0]:
model.compile(loss="kullback_leibler_divergence",
              optimizer=Adam(),
              metrics=[acc])

In [0]:
batch_size = 128
step_size = 128
base_gen = ImageDataGenerator(
    horizontal_flip=True,
    width_shift_range=4.0/32.0,
    height_shift_range=4.0/32.0
  ).flow(X_train, Y_train, step_size)
train_gen = bclearning_generator(base_gen, batch_size, step_size, X_train.shape[0]//step_size)

history = model.fit_generator(
    train_gen, steps_per_epoch=X_train.shape[0]//step_size,
    validation_data=(X_test, Y_test),
    epochs=2)

In [0]:
K.clear_session()

BC-Learning結果

30s/epoch  
20epoch/10min  
75%

@koshian2氏のBC-Learningのテストで使われている深めのCNN

https://qiita.com/koshian2/items/d0661842eb66a7c0c0f3

In [0]:
def _conv_bn_relu(x, filters):
    """Apply a 3x3 'same' convolution, BatchNormalization and ReLU to `x`."""
    x = Conv2D(filters, 3, padding="same")(x)
    x = BatchNormalization()(x)
    return Activation("relu")(x)


# Named `inputs` (not `input`) so the Python builtin is not shadowed for the
# rest of the notebook session.
inputs = Input((32,32,3))
x = inputs

# Stage 1: three conv-BN-ReLU units with 64 filters.
x = _conv_bn_relu(x, 64)
x = _conv_bn_relu(x, 64)
x = _conv_bn_relu(x, 64)

# After every three convolutions: average-pool and double the width.
x = AveragePooling2D(2)(x)
x = _conv_bn_relu(x, 128)
x = _conv_bn_relu(x, 128)
x = _conv_bn_relu(x, 128)

# After every three convolutions: average-pool and double the width.
x = AveragePooling2D(2)(x)
x = _conv_bn_relu(x, 256)
x = _conv_bn_relu(x, 256)
x = _conv_bn_relu(x, 256)

# Global average pooling, then a fully connected softmax classifier.
x = GlobalAveragePooling2D()(x)
x = Dense(10, activation="softmax")(x)
model = Model(inputs, x)

In [0]:
model.compile(loss="kullback_leibler_divergence",
              optimizer=Adam(),
              metrics=[acc])

In [0]:
batch_size = 128
step_size = 128
base_gen = ImageDataGenerator(
    horizontal_flip=True,
    width_shift_range=4.0/32.0,
    height_shift_range=4.0/32.0
  ).flow(X_train, Y_train, step_size)
train_gen = bclearning_generator(base_gen, batch_size, step_size, X_train.shape[0]//step_size)

history = model.fit_generator(
    train_gen, steps_per_epoch=X_train.shape[0]//step_size,
    validation_data=(X_test, Y_test),
    epochs=2)

参考用元の深いネットワークのもの

53s/epoch  
10epoch/10min  
78%

## KerasのFashion-MNIST用のsample

KerasからTPUを使うためのサンプルでFashion-MNISTを学習するためのサンプル用ネットワークがありますので参考に。  
https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/fashion_mnist.ipynb

In [0]:
# Network from the Keras Fashion-MNIST TPU sample: three identical stages of
# BatchNorm -> 5x5 conv (256 filters, ELU) -> 2x2 max-pool -> Dropout(0.25),
# followed by a 256-unit ELU dense layer and a 10-way softmax.
model = Sequential()

# Stage 1. Only the first layer of a Sequential model needs input_shape;
# later layers infer their input from the previous layer's output.
model.add(BatchNormalization(input_shape=X_train.shape[1:]))
model.add(Conv2D(256, (5, 5), padding='same', activation='elu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Dropout(0.25))

# Stage 2
model.add(BatchNormalization())
model.add(Conv2D(256, (5, 5), padding='same', activation='elu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Dropout(0.25))

# Stage 3
model.add(BatchNormalization())
model.add(Conv2D(256, (5, 5), padding='same', activation='elu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Dropout(0.25))

# Classifier head
model.add(Flatten())
model.add(Dense(256))
model.add(Activation('elu'))
model.add(Dropout(0.5))
model.add(Dense(10))
model.add(Activation('softmax'))


In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=1024, epochs=5)

Keras Fashion-MNIST sample結果

40s/epoch  
15epoch/10min  
80%

In [0]:
K.clear_session()