In [None]:
!pip install keras-tuner==1.0.1

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import seaborn as sns 
import matplotlib.pyplot as plt
import random
import tensorflow as tf
import kerastuner

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
tf.__version__

In [None]:
# Seed every random number generator the notebook uses, not only TensorFlow's,
# so that the sampled example digits and the initial model weights are
# repeatable between runs.
SEED = 42
random.seed(SEED)         # Python's `random`, used to pick the example images
np.random.seed(SEED)      # NumPy's global generator
tf.random.set_seed(SEED)  # TensorFlow ops and weight initialisers

# 1. Loading and inspecting the data:

In [None]:
# Read the two competition CSVs. `train` has a 'label' column that is split off
# in the next cell; `test` is only used for the final predictions.
train = pd.read_csv(r'/kaggle/input/digit-recognizer/train.csv')
test = pd.read_csv(r'/kaggle/input/digit-recognizer/test.csv')

In [None]:
# Separate the pixel columns (features) from the digit label (target).
X_train = train.drop(columns='label')
y_train = train['label']

In [None]:
y_train.head()

In [None]:
X_train.head()

In [None]:
X_train.info()

In [None]:
# Scale the pixel values by 1/255 (assumes 8-bit intensities, so the result
# lies in [0, 1]). The result is derived from `train` instead of from `X_train`
# itself, which keeps the cell idempotent: running it again can no longer
# divide the pixels by 255 a second time.
X_train = train.drop('label', axis=1) / 255

In [None]:
y_train.nunique()

# 2. Exploratory analysis:

In [None]:
# Bar chart with the number of training samples available for each digit.
count_ax = sns.countplot(x=y_train)
count_ax.set_title('# Of samples', size=14)
plt.show()

In [None]:
# Show a 5x5 grid of randomly chosen training images, each titled with its label.
fig, ax = plt.subplots(5, 5, figsize=(8, 8))
fig.suptitle('Digits images and labels', fontsize=16)
ax = ax.ravel()
for i in range(25):
    # randrange() excludes the upper bound. randint(0, n) can return n itself,
    # which is one past the last row and makes .iloc raise an IndexError.
    sample_n = random.randrange(X_train.shape[0])
    ax[i].imshow(np.array(X_train.iloc[sample_n]).reshape(28, 28), cmap='inferno')
    ax[i].get_xaxis().set_visible(False)
    ax[i].get_yaxis().set_visible(False)
    # Positional lookup, so image and label always come from the same row.
    ax[i].set_title(y_train.iloc[sample_n], fontsize = 12)

plt.subplots_adjust(hspace=0.3)

If you average all the images pixel by pixel, you can see which pixels are most commonly painted. Depending on how you look at the resulting image, you can also make out different digits (the easiest to spot are 9 and 3).

In [None]:
# Average every pixel column over all training rows and draw it as one image.
mean_pixels = np.array(X_train.mean())
plt.imshow(mean_pixels.reshape(28, 28), cmap='inferno')
plt.colorbar()
plt.title('average shape', fontsize=16)
plt.show()

In [None]:
# One panel per digit 0-9: the pixel-wise mean of all training rows with that label.
fig, ax = plt.subplots(2, 5, figsize=(8, 4))
fig.suptitle('Average shape per digit', fontsize=16)

ax = ax.ravel()

for i, panel in enumerate(ax[:10]):
    rows_for_digit = train[train['label'] == i]
    mean_pixels = rows_for_digit.drop('label', axis=1).mean()
    panel.imshow(np.array(mean_pixels).reshape(28, 28), cmap='inferno')
    panel.get_xaxis().set_visible(False)
    panel.get_yaxis().set_visible(False)
    panel.set_title(i, fontsize = 12)

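## Transforming the data:
CNNs are often better than plain dense networks at learning from visual data, because their convolution and pooling layers exploit the spatial structure of the image. To feed one, the flat rows of pixels are reshaped below into 28×28×1 arrays.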

In [None]:
X_train_array = np.array(X_train)
X_train_array

In [None]:
test_array = np.array(test)
test_array

In [None]:
X_train_array.shape

In [None]:
test_array.shape

In [None]:
# Reshape to (samples, 28, 28, 1). Using -1 lets NumPy infer the sample count
# instead of hard-coding 42000, matching how the test array is reshaped.
X_train_array = X_train_array.reshape(-1, 28, 28, 1)
X_train_array.shape

In [None]:
test_array = test_array.reshape(-1, 28, 28, 1)
test_array.shape

# 3. Setting Data augmentation:

Data augmentation is a technique that creates new training data by making small changes to the original inputs (shifting, rotating, zooming, ... the image).

In [None]:
# Augmenting generator: random shifts of up to 20% in each direction and
# rotations of up to 20 degrees, with both flips explicitly disabled.
# validation_split reserves 20% of the samples so that
# flow(..., subset='training' / 'validation') can serve two disjoint subsets.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    width_shift_range=0.2,
    height_shift_range=0.2,
    fill_mode="constant",  # pixels exposed by a shift/rotation get a constant fill value
    
    horizontal_flip=False,
    vertical_flip=False,
    
    rotation_range=20,
    
    validation_split=0.2
)

In [None]:
# NOTE(review): per the Keras documentation, fit() is only required when
# featurewise_center, featurewise_std_normalization or zca_whitening is
# enabled, and `datagen` sets none of them — this call is probably
# unnecessary; confirm before removing it.
datagen.fit(X_train_array)

# 4. Training model(s):

In [None]:
tf.random.set_seed(42)

In [None]:
def create_model(hp=None):
    """Build and compile the CNN that classifies 28x28x1 digit images.

    Parameters
    ----------
    hp : kerastuner HyperParameters or None, optional
        When supplied, the number of dense layers, the width of each dense
        layer, the dropout rate and the Adam settings are sampled from it.
        When omitted, the fixed defaults defined below are used.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with Adam, sparse categorical cross-entropy loss and
        the accuracy metric.
    """
    # Fixed defaults, used when no tuner drives the build.
    num_units = [784, 392, 151, 50]
    dropout_rate = 0.2
    learning_rate = 0.01
    # Plain floats on purpose: a trailing comma (`beta_1 = 0.9,`) binds the
    # 1-tuple (0.9,) and would hand Adam a tuple instead of a number.
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon = 1e-07
    amsgrad = False

    if hp is not None:
        # Model hyperparameters
        num_dense_layers = hp.Choice('num_dense_layers', [1, 2, 3, 4, 5, 6, 7, 8])
        # Every layer gets its own hyperparameter name so that its width is
        # sampled independently; asking for the same name repeatedly yields
        # the same value for all layers.
        num_units = [
            hp.Choice(f'num_units_{layer}', [784, 392, 151, 50])
            for layer in range(num_dense_layers)
        ]
        dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5)

        # Adam hyperparameters
        learning_rate = hp.Float('learning_rate', min_value=1e-6, max_value=0.01)
        beta_1 = hp.Float('beta_1', min_value=0.5, max_value=0.99)
        beta_2 = hp.Float('beta_2', min_value=0.9, max_value=0.9999)
        epsilon = hp.Float('epsilon', min_value=1e-9, max_value=1e-6)
        amsgrad = hp.Choice('amsgrad', [True, False])

    model = tf.keras.Sequential()

    # Convolutional feature extractor: two conv -> max-pool -> dropout stages.
    model.add(tf.keras.layers.Conv2D(32, (5, 5), activation='relu', input_shape=(28, 28, 1)))
    model.add(tf.keras.layers.MaxPooling2D((3, 3)))
    model.add(tf.keras.layers.Dropout(dropout_rate))

    model.add(tf.keras.layers.Conv2D(48, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((3, 3)))
    model.add(tf.keras.layers.Dropout(dropout_rate))

    model.add(tf.keras.layers.Flatten())

    # Dense head: one ReLU layer followed by dropout per entry of num_units.
    # No input_shape here; only the first layer of the model declares one.
    for units in num_units:
        model.add(tf.keras.layers.Dense(units=units, activation='relu'))
        model.add(tf.keras.layers.Dropout(dropout_rate))

    # One softmax output per digit class.
    model.add(tf.keras.layers.Dense(units=10, activation='softmax'))

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=learning_rate,
        beta_1=beta_1,
        beta_2=beta_2,
        epsilon=epsilon,
        amsgrad=amsgrad,
    )
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
model = create_model()

In [None]:
model.summary()

In [None]:
class MyTuner(kerastuner.tuners.BayesianOptimization):
    """Bayesian-optimisation tuner that additionally searches over the batch size."""

    def run_trial(self, trial, *args, **kwargs):
        """Add a per-trial batch size to the fit arguments, then run the trial.

        The batch size is registered on the trial's hyperparameters as
        'Batch_size' (32 to 256 in steps of 32) and passed on as the
        `batch_size` keyword argument.
        """
        kwargs['batch_size'] = trial.hyperparameters.Int('Batch_size', 32, 256, step=32)
        # Hand back whatever the parent implementation returns instead of
        # dropping it. NOTE(review): later keras-tuner releases are understood
        # to consume this return value — confirm against the installed version.
        return super().run_trial(trial, *args, **kwargs)

In [None]:
# Tuner that maximises validation accuracy over 20 trials of create_model.
# The explicit seed makes the sequence of sampled hyperparameters repeatable,
# in line with the fixed seeds used elsewhere in the notebook.
tuner = MyTuner(
    create_model,
    objective='val_accuracy',
    max_trials=20,
    seed=42,
    directory='./',
    project_name='digits',
    overwrite=True
)

In [None]:
tuner.search_space_summary()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the training images as a dev set for the tuner search.
x_train_split, x_dev, y_train_split, y_dev = train_test_split(
    X_train_array,
    y_train,
    test_size=0.3,
    random_state=42,
)

In [None]:
tuner.search(
    x_train_split, y_train_split,
    validation_data=(x_dev, y_dev),
    epochs=5,
    verbose=False
)

In [None]:
tuner.results_summary(1)

In [None]:
model = tuner.get_best_models(num_models=1)[0]

In [None]:
model.summary()

In [None]:
# Rebuild the network by hand with fixed hyperparameter values.
# NOTE(review): the values look like they were copied from a tuner run
# (two dense layers of 151 units) — confirm they are still the best ones.
# The dropout rate is named once instead of being repeated in four places.
BEST_DROPOUT_RATE = 0.27144151078096934

model = tf.keras.models.Sequential([
    # Convolutional feature extractor
    tf.keras.layers.Conv2D(32, (5, 5), activation='relu', input_shape=(28, 28, 1)),
    tf.keras.layers.MaxPooling2D((3, 3)),
    tf.keras.layers.Dropout(BEST_DROPOUT_RATE),

    tf.keras.layers.Conv2D(48, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((3, 3)),
    tf.keras.layers.Dropout(BEST_DROPOUT_RATE),

    tf.keras.layers.Flatten(),

    # Dense head
    tf.keras.layers.Dense(units=151, activation='relu'),
    tf.keras.layers.Dropout(BEST_DROPOUT_RATE),

    tf.keras.layers.Dense(units=151, activation='relu'),
    tf.keras.layers.Dropout(BEST_DROPOUT_RATE),

    # One softmax output per digit class
    tf.keras.layers.Dense(units=10, activation='softmax'),
])

In [None]:
# Compile the hand-built model with a fixed Adam configuration.
final_optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.005783725692573537,
    beta_1=0.5050154442517982,
    beta_2=0.9041756976654143,
    epsilon=6.826172296168995e-07,
    amsgrad=False,
)
model.compile(
    optimizer=final_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Early-stopping callback passed to both model.fit() calls below. It watches
# val_loss, counts an epoch as an improvement only if the loss moves by at
# least min_delta, gives up after `patience` epochs without one, and restores
# the best weights seen when it stops training.
es = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0.008,
    patience=10,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

In [None]:
# Train on augmented batches; both subsets come from the same generator and
# are separated by its validation_split. The epoch count is only an upper
# bound because the early-stopping callback ends the run.
augmented_training_batches = datagen.flow(
    X_train_array, y_train, batch_size=160, subset='training')
augmented_validation_batches = datagen.flow(
    X_train_array, y_train, batch_size=64, subset='validation')

history = model.fit(
    augmented_training_batches,
    epochs=1000,
    validation_data=augmented_validation_batches,
    callbacks=[es],
)

I'll also train the network on the original (non-augmented) data, so that it will be able to generalize better.

In [None]:
# Continue training the same model on the un-augmented images, holding out
# 30% of them for validation.
# NOTE(review): the History object returned here is not stored, so the
# loss/accuracy plots further down show only the augmented run kept in `history`.
model.fit(X_train_array,
        y_train,
        batch_size=160,
        epochs=20,
        validation_split=0.3,
        callbacks=[es])

# 5. Evaluating model performance:

In [None]:
history.history.keys()

In [None]:
# Training vs. validation loss per epoch for the run stored in `history`.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

plt.title('Model loss', fontsize = 14)
plt.plot(range(len(train_loss)), train_loss, marker='o', c='gray')
plt.plot(range(len(val_loss)), val_loss, marker='o')
plt.legend(labels=['training loss', 'validation loss'])
plt.show()

In [None]:
# Training vs. validation accuracy per epoch for the run stored in `history`.
epoch_axis = range(len(history.history['loss']))

plt.title('Model accuracy', fontsize = 14)
plt.plot(epoch_axis, history.history['accuracy'], c='gray', marker='o')
plt.plot(epoch_axis, history.history['val_accuracy'], marker='o')
plt.legend(labels=['training accuracy', 'validation accuracy'])
plt.show()

In [None]:
ev = model.evaluate(X_train_array, y_train)
ev

In [None]:
pred = model.predict(X_train_array)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
y_train.unique()

In [None]:
# Turn each row of class probabilities into the index of its largest entry,
# i.e. the predicted digit.
pred = np.argmax(pred, axis=1)

In [None]:
print(classification_report(y_train, pred))

In [None]:
# Confusion matrix of the training-set predictions. fmt='d' prints the cell
# counts as whole numbers; the default format abbreviates large counts into
# scientific notation. sklearn puts true labels on rows and predictions on columns.
plt.figure(figsize=(8, 6))
plt.title('Predicted digits', size=14)
sns.heatmap(confusion_matrix(y_train, pred), cmap='inferno', annot=True, fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# 6. Predicting test data:

In [None]:
# Scale the test pixels by 1/255, exactly as was done for the training pixels.
# The array is rebuilt from `test`, which keeps the cell idempotent:
# re-running it cannot shrink the inputs a second time before prediction.
test_array = np.array(test).reshape(-1, 28, 28, 1) / 255

In [None]:
test_result = model.predict(test_array)

In [None]:
# Turn each row of class probabilities into the index of its largest entry,
# i.e. the predicted digit.
test_result = np.argmax(test_result, axis=1)

In [None]:
# Show a 5x5 grid of randomly chosen test images, each titled with the digit
# the model predicted for it.
fig, ax = plt.subplots(5, 5, figsize=(8, 8))
fig.suptitle('Digits images and labels', fontsize=16)
ax = ax.ravel()
for i in range(25):
    # randrange() excludes the upper bound. randint(0, n) can return n itself,
    # which is out of range for both test.iloc and test_result.
    sample_n = random.randrange(test.shape[0])
    ax[i].imshow(np.array(test.iloc[sample_n]).reshape(28, 28), cmap='inferno')
    ax[i].get_xaxis().set_visible(False)
    ax[i].get_yaxis().set_visible(False)
    ax[i].set_title(test_result[sample_n], fontsize = 12)

plt.subplots_adjust(hspace=0.3)
# plt.show() is what the other plotting cells use; fig.show() targets GUI
# backends and can emit a warning under the notebook's inline backend.
plt.show()

In [None]:
predicted_data = pd.DataFrame({
    'ImageId': test.index+1,
    'Label': test_result
})

In [None]:
predicted_data

In [None]:
predicted_data.to_csv('submission.csv', index=False)