In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image

In [None]:
# Load the competition CSVs. The slicing below treats column 0 of train.csv
# as the label and every remaining column as pixel features; test.csv is
# used as features only.
train_df = pd.read_csv('../input/digit-recognizer/train.csv')
test_df = pd.read_csv('../input/digit-recognizer/test.csv')

# Shuffle with a fixed seed so the train/validation split is the same on
# every "Restart & Run All" (an unseeded shuffle makes validation scores
# incomparable between runs).
train_df = train_df.sample(frac=1, random_state=42)

# Everything except the last 5000 shuffled rows is used for training ...
Y_train = train_df.iloc[:-5000, 0].to_numpy()
X_train = train_df.iloc[:-5000, 1:].to_numpy()

# ... and the last 5000 shuffled rows are held out for validation.
Y_val = train_df.iloc[-5000:, 0].to_numpy()
X_val = train_df.iloc[-5000:, 1:].to_numpy()

X_test = test_df.to_numpy()

# Feature scaling

In [None]:
# Divide the raw pixel values by 256 and reshape every row into a
# 28x28 single-channel image (the leading -1 keeps the sample count).
X_train, X_val, X_test = (
    (pixels / 256).reshape((-1, 28, 28, 1))
    for pixels in (X_train, X_val, X_test)
)

# One-hot encode the labels over 10 classes.
Y_train, Y_val = (to_categorical(labels, 10) for labels in (Y_train, Y_val))

# Creating data generator

In [None]:
# Mild geometric augmentation: small random rotations, horizontal/vertical
# shifts and zooms applied to the training images.
generator = image.ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
)

# Iterator that yields augmented (images, labels) batches of 1000 samples.
train_flow = generator.flow(x=X_train, y=Y_train, batch_size=1000)

# Training separate models

Reduce the learning rate when validation accuracy plateaus to speed up learning, and use early stopping when training the ensemble to avoid overfitting.

In [None]:
# Learning-rate schedule: when validation accuracy has not improved for
# 3 epochs, scale the learning rate by 0.8, never going below 1e-5.
reduceLr = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.8,
    patience=3,
    min_lr=1e-5,
    verbose=0,
)

In [None]:
# Stop as soon as one epoch fails to raise validation accuracy by at least
# 1e-4. With patience=1 the last epoch run is, by construction, one that did
# not improve, so restore_best_weights=True rolls the model back to the best
# epoch instead of keeping the weights from the worse one.
early_stop = EarlyStopping(
    monitor='val_accuracy',
    min_delta=1e-4,
    patience=1,
    restore_best_weights=True,
    verbose=2,
)

In [None]:
def conv_model(input_shape,
               filters=(8, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512),
               dense_units=256,
               num_classes=10):
    """Build an (uncompiled) plain convolutional classifier.

    The network is a stack of unpadded 3x3 convolutions, each followed by
    batch normalization and ReLU, then a flatten, one hidden dense block and
    a batch-normalized softmax output.

    Args:
        input_shape: Shape of a single input image, e.g. ``(28, 28, 1)``.
        filters: Number of filters for each successive convolution. Every
            unpadded 3x3 convolution shrinks both spatial dimensions by 2,
            so the default 12-layer ladder reduces a 28x28 input to 4x4.
        dense_units: Width of the hidden dense layer after flattening.
        num_classes: Number of output classes (size of the softmax).

    Returns:
        A ``Model`` mapping an image batch to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Convolutional feature extractor: Conv -> BatchNorm -> ReLU per entry.
    features = inputs
    for n_filters in filters:
        features = layers.Conv2D(n_filters, kernel_size=3, strides=1,
                                 padding='valid')(features)
        features = layers.BatchNormalization()(features)
        features = layers.ReLU()(features)

    features = layers.Flatten()(features)

    # Hidden dense block.
    features = layers.Dense(dense_units)(features)
    features = layers.BatchNormalization()(features)
    features = layers.ReLU()(features)

    # Classification head: logits are batch-normalized before the softmax.
    logits = layers.Dense(num_classes)(features)
    logits = layers.BatchNormalization()(logits)
    probabilities = layers.Softmax()(logits)

    return Model(inputs=inputs, outputs=probabilities)

In [None]:
epochs = 50
# Derive the step count from the generator's own batch size so the two
# cannot drift apart if the batch size is changed where train_flow is built.
steps = X_train.shape[0] // train_flow.batch_size
models = []
for i in range(30):
    # Each ensemble member is an independently initialised copy of the CNN.
    model = conv_model((28, 28, 1))
    # Pass compile arguments by keyword: the third positional parameter is
    # `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, in which case
    # accuracy would not be tracked and the 'val_accuracy' monitor would
    # have nothing to read.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_flow,
              epochs=epochs,
              steps_per_epoch=steps,
              verbose=0,
              callbacks=[reduceLr],
              validation_data=(X_val, Y_val))
    models.append(model)

# Training an ensemble

In [None]:
def model_ansamble(input_shape, models):
    """Build an (uncompiled) ensemble on top of already trained models.

    Every model in ``models`` is frozen (``trainable = False``, which mutates
    the passed-in models) and applied to a shared input. Their outputs are
    summed element-wise and fed through a small trainable dense head that
    ends in a 10-way softmax.

    Args:
        input_shape: Shape of a single input image, e.g. ``(28, 28, 1)``.
        models: Iterable of trained models that all accept ``input_shape``
            and produce outputs of identical shape.

    Returns:
        A ``Model`` mapping an image batch to class probabilities.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError('model_ansamble requires at least one model')

    X = layers.Input(shape=input_shape)

    # Freeze each member so only the dense head below is trained.
    member_outputs = []
    for member in models:
        member.trainable = False
        member_outputs.append(member(X))

    # A sum over a single tensor is the tensor itself; skipping the merge
    # layer also avoids the "at least 2 inputs" check of older Keras releases.
    if len(member_outputs) == 1:
        Y = member_outputs[0]
    else:
        Y = layers.Add()(member_outputs)

    # Trainable head: Dense -> BatchNorm -> ReLU for each hidden width.
    for units in [1024, 256, 64]:
        Y = layers.Dense(units)(Y)
        Y = layers.BatchNormalization()(Y)
        Y = layers.ReLU()(Y)

    Y = layers.Dense(10)(Y)
    Y = layers.BatchNormalization()(Y)
    Y = layers.Softmax()(Y)

    model = Model(inputs=X, outputs=Y)

    return model

In [None]:
# Build the ensemble over the trained base models.
model = model_ansamble((28, 28, 1), models)
# Keyword arguments keep `metrics` bound correctly on every Keras version:
# the third positional parameter of compile() is `loss_weights` in Keras 3.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Train the ensemble for at most 20 epochs of 5 augmented batches each;
# the early_stop callback may end training sooner.
epochs = 20
steps = 5
model.fit(
    train_flow,
    steps_per_epoch=steps,
    epochs=epochs,
    validation_data=(X_val, Y_val),
    callbacks=[early_stop],
    verbose=2,
)

# Submitting solution

In [None]:
# Predict class probabilities for the test images and keep, per row,
# the index of the most probable class.
predictions = model.predict(X_test).argmax(axis=1)

In [None]:
# Start from the sample submission file, overwrite its 'Label' column with
# the predicted classes and write the result without the index.
submission = (
    pd.read_csv('../input/digit-recognizer/sample_submission.csv')
    .assign(Label=predictions)
)
submission.to_csv('submission.csv', index=False)