In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from functools import partial
#import seaborn as sn
from sklearn.model_selection import train_test_split

import random

# Seed Python's built-in RNG. The module is imported as a whole so that the
# variable `seed` below no longer overwrites an imported function of the same name.
random.seed(1)

# Seed value handed to scikit-learn and TensorFlow further down in the notebook.
seed = 43

import tensorflow as tf
from tensorflow import keras
from tensorflow import image
from tensorflow import core
from tensorflow.keras import layers
print("Tensorflow Version: ", tf.__version__)
print("Keras Version: ",keras.__version__)


# 1 selects the Kaggle input directory, any other value the local checkout.
kaggle = 1

# change your local path here
MNIST_PATH = (
    '../input/digit-recognizer'
    if kaggle == 1
    else '../Digit_Recognition_with_a_Deep_Neural_Network/data/input/digit-recognizer'
)

import os

# Print the path of every file found below the data directory.
for current_dir, _, names in os.walk(MNIST_PATH):
    for name in names:
        print(os.path.join(current_dir, name))
        

# Introduction - MNIST Training Competition
This notebook is a fork of my previously developed notebook for digit recognition. Therefore, you will find some parts that look similar to the notebook <a href="https://www.kaggle.com/skiplik/digit-recognition-with-a-deep-neural-network">Digit Recognition with a Deep Neural Network</a> and some parts that are completely different.

With this I want to take a deeper look into some parts of fine-tuning hyperparameters. The following list shows some of the fine-tuning parameters which I will take a look into (one, two or more of them):
- Dwindling / Exploding Gradients
    - <b>Initializing the Weights</b>
    - <b>Batchnormalization</b>
    - <s>Gradient Clipping</s>
    - <b>Saturated Activation Functions</b>
- Optimizers
    - <s>Momentum Optimizers</s>
    - <s>Nesterov</s>
    - <s>AdaGrad</s>
    - <s>RMSProp</s>
    - <b>Adam - Optimizer</b>
    - <s>Scheduling Learnrate</s>
- Regularization
    - <s>Drop-Outs</s>
    - <b>l1 / l2 - Regularization</b>
    - <s>Monte-Carlo Drop-out ???</s>
    - <s>Max Norm Regularization ????</s>

Not part of this notebook will be the use of pretrained neural networks (Transferlearning). I just want to list this here for the sake of completeness.

Link to the data topic: https://www.kaggle.com/c/digit-recognizer/data

As in the previous notebooks, I will use TensorFlow with Keras. As already mentioned in other notebooks, I will skip some explanations about the dataset here. Moreover, I will use the knowledge already gained about the data and transform/prepare the data right away.

## Notebook Versions with Different Hyperparameter Configurations
As described in the part above, I used/tested different hyperparameter settings to get a little bit closer to their effects on the neural network and the network's results. I know that there are parameters that affect other parameters when they are changed (and which therefore should have been changed as well); however, in these cases I just experimented a little. Sometimes I kept one or two parameters together which belong together (e.g. kernel initializer "lecun" and activation function "selu") and sometimes not. The main purpose here was to use them and see the results.

Therefore on Kaggle you can look in the different versions of this notebook if you are interested. In the following I will list some versions with the used hyperparameter config in it:
- Version 7 and 6:
    - Activation Function - "relu"
    - Initializing Weights - "He Normalization"
    - Batchnormalization
- Version 9:
    - Activation Function - "selu"
    - Initializing Weights - "LeCun Normal"
- Version 12 and 14:
    - Regularisation with L1 and L2
- Version 15:
    - Activation Function - "relu"
    - Initializing Weights - "He Normalization"
    - Batchnormalization
    - Optimizer - "Adam"
- Version 18:
    - Activation Function - "relu"
    - Initializing Weights - "He Normalization"
    - Batchnormalization
    - Optimizer - "Adam"

The current best run was based on Version 16 with an accuracy of 0.97714 in the Kaggle competition "Digit Recognizer".

## Tensorflow Data Api for Preprocessing
Since notebook version 18 I use the TensorFlow Data API for preprocessing. It helps a lot with batching and processing a large amount of data really fast. More interesting is the fact that the whole data processing part can be added to the model itself, so that it can be transported to mobile devices later.


## My other Projects
If you are interested in a more detailed analysis of the dataset, take a look at my other notebooks about the MNIST dataset:
- Digit Recognition with a Deep Neural Network: https://www.kaggle.com/skiplik/digit-recognition-with-a-deep-neural-network
- Another MNIST Try: https://www.kaggle.com/skiplik/another-mnist-try
- First NN by Detecting Handwritten Characters: https://www.kaggle.com/skiplik/first-nn-by-detecting-handwritten-characters
...




# Get Data

In [None]:
# File names of the two competition CSVs; they are joined with MNIST_PATH when loaded.
CSV_FILE_TRAIN='train.csv'
CSV_FILE_TEST='test.csv'

def load_mnist_data(minist_path, csv_file):
    """Read one CSV file from the data directory into a DataFrame.

    Args:
        minist_path: Directory that contains the CSV file.
        csv_file: Name of the file inside ``minist_path``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(os.path.join(minist_path, csv_file))

def load_mnist_data_manuel(minist_path, csv_file):
    """Read a CSV file manually and return its raw text lines.

    Args:
        minist_path: Directory that contains the CSV file.
        csv_file: Name of the file inside ``minist_path``.

    Returns:
        List of lines as returned by ``readlines()`` (line endings included).
    """
    csv_path = os.path.join(minist_path, csv_file)
    # The context manager closes the handle even if reading raises.
    with open(csv_path, 'r') as csv_handle:
        return csv_handle.readlines()

def split_train_val(data, val_ratio):
    """Placeholder for a manual train/validation split.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    return None
    

# Load the training CSV and the competition (test) CSV from the data directory.
train, test = (
    load_mnist_data(MNIST_PATH, file_name)
    for file_name in (CSV_FILE_TRAIN, CSV_FILE_TEST)
)

In [None]:
# Separate the pixel columns (features) from the 'label' column (target).
X = train.drop(columns=['label'])
y = train['label'].copy()

# competition dataset
X_test = test.copy()

## Train / Val Split

In [None]:
# Report the dimensions of the feature matrix and the label vector.
for caption, data in (
    ("Shape of the Features: ", X),
    ("Shape of the Labels: ", y),
):
    print(caption, data.shape)

### Label Value Count
Visualizing the label distribution of the full train dataset.

In [None]:
# Number of training rows per value of the 'label' column.
train.value_counts('label')

In [None]:
# Stratified 80/20 split: stratify=y keeps the label proportions of y in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=seed,
)

Comparing the label distributions of the stratified train and validation sets, based on the given label y.

In [None]:
# Relative label frequencies of the training part ...
print("Train - Set Distribution")
train_label_counts = y_train.value_counts()
print(train_label_counts / train_label_counts.sum())

# ... a three-line visual separator ...
for _separator in range(3):
    print('--------------------------------------------------------------')

# ... and the relative label frequencies of the validation part.
print("Val - Set Distribution")
val_label_counts = y_val.value_counts()
print(val_label_counts / val_label_counts.sum())


In [None]:
# Shapes of the full feature set and of every part produced by the split.
for caption, data in (
    ("X: ", X),
    ("X_train: ", X_train),
    ("X_val: ", X_val),
    ("y_train: ", y_train),
    ("y_val: ", y_val),
):
    print(caption, data.shape)

## Building Transformation Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

# Steps of the scikit-learn preprocessing pipeline; the Normalizer step is
# kept as a commented-out alternative.
pipeline_steps = [
    #('normalizer', Normalizer())
    ('std_scalar', StandardScaler()),
]
pipeline = Pipeline(pipeline_steps)

## Preprocessing Data

### Data Augmentation with Tensorflow Data Api

In [None]:
def random_crop(image, crop_percent=85):
    """Cut a random square patch out of ``image``.

    The side of the patch is ``crop_percent`` percent (integer division) of
    the smaller of the first two dimensions of ``image``.

    Args:
        image: Image tensor. The crop size hard-codes a last dimension of 1,
            so a single-channel ``(height, width, 1)`` tensor is expected.
        crop_percent: Patch side as an integer percentage of the shorter
            image side. Defaults to 85, the value that was hard-coded before
            (the old inline comment wrongly said 90%).

    Returns:
        The cropped tensor of shape ``[side, side, 1]`` produced by
        ``tf.image.random_crop``.
    """
    shape = tf.shape(image)
    side = tf.reduce_min([shape[0], shape[1]]) * crop_percent // 100
    return tf.image.random_crop(image, [side, side, 1])


def crop_flip_resize(image, label, flipping = True):
    """Randomly crop an image, optionally mirror it, and resize it to 28x28.

    Args:
        image: Image tensor that is handed to ``random_crop``.
        label: Label belonging to ``image``; passed through unchanged.
        flipping: When truthy, the cropped patch is flipped left-to-right
            before resizing.

    Returns:
        Tuple ``(final_image, label)`` where ``final_image`` is the patch
        resized to ``[28, 28]`` with ``tf.image.resize``.
    """
    # The crop happens in both cases; only the mirroring is optional.
    cropped_image = random_crop(image)
    if flipping:
        cropped_image = tf.image.flip_left_right(cropped_image)

    # Bring the patch back to the original MNIST resolution.
    final_image = tf.image.resize(cropped_image, [28,28])
    return final_image, label
    

In [None]:
# Shape of the validation features before they are reshaped into images.
X_val.shape

In [None]:
# Converting the DataFrame format into a TensorFlow compatible format:
# (samples, 28, 28, 1). np.asarray accepts both a DataFrame and an already
# reshaped array, so this cell can be re-run without failing on `.values`.
X_train = np.asarray(X_train).reshape(-1, 28, 28, 1)
X_val = np.asarray(X_val).reshape(-1, 28, 28, 1)

# Independent copies that serve as the source for the cropped variants.
X_train_crop = X_train.copy()
X_val_crop = X_val.copy()

In [None]:
# Creating tensor based datasets

def _to_dataset(features, labels):
    """Wrap features and labels into a ``tf.data.Dataset`` of (image, label) pairs.

    Features are cast to float32 and labels to int32 before slicing.
    """
    return tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(features, tf.float32),
            tf.cast(labels, tf.int32),
        )
    )


# Unmodified images
training_dataset = _to_dataset(X_train, y_train)
val_dataset = _to_dataset(X_val, y_val)

# Copies of the same images with the same labels, used as the source for cropping
training_crop_dataset = _to_dataset(X_train_crop, y_train)
val_crop_dataset = _to_dataset(X_val_crop, y_val)


In [None]:
# Resize and crop the images with the self-built function; flipping is switched off.
# Note: re-running this cell maps the already mapped datasets a second time.
crop_and_resize = partial(crop_flip_resize, flipping=False)
training_crop_dataset = training_crop_dataset.map(crop_and_resize)
val_crop_dataset = val_crop_dataset.map(crop_and_resize)

In [None]:
# Visualizing one cropped and resized image from the new dataset.
for X_values, y_values in training_crop_dataset.take(1):
    # Drop the trailing channel axis so imshow receives a plain 2-D image.
    plt.imshow(tf.squeeze(X_values, axis=-1))
    plt.title(f"Label: {int(y_values)}")

In [None]:
# Append the cropped variants to the original images (train and validation separately).
training_dataset_all, val_dataset_all = (
    training_dataset.concatenate(training_crop_dataset),
    val_dataset.concatenate(val_crop_dataset),
)

In [None]:
# cardinality() reports the element count without iterating over (and
# preprocessing) every element, which len(list(...)) would do.
print("training_dataset_all length: ", training_dataset_all.cardinality().numpy())
print("val_dataset_all length: ", val_dataset_all.cardinality().numpy())


In [None]:
# shuffling and batching data
tf.random.set_seed(seed)

# The originals and their cropped variants were concatenated back to back.
# The shuffle buffer therefore spans the whole dataset; with a smaller buffer
# the early batches of an epoch would contain (almost) only originals and the
# late ones only crops. Assumes the cardinality is known, which holds for
# datasets built with from_tensor_slices / map / concatenate.
train_ds = (
    training_dataset_all
    .shuffle(training_dataset_all.cardinality())
    .batch(32)
    .prefetch(1)
)
val_ds = val_dataset_all.shuffle(8000).batch(32).prefetch(1)

# Building a Deep Neural Network

## Preparing Model Visualization with Tensorboard (not for Kaggle)

In [None]:
# Parent directory (relative to the notebook) below which each run gets its own log folder.
root_logdir = "../../tensorboard-logs"

print("Relative root_logdir: ",root_logdir)

def get_run_logdir():
    """Build the log directory path for a new run.

    Returns:
        ``root_logdir`` joined with a folder name of the form
        ``run_<year>_<month>_<day>-<hour>_<minute>_<second>`` taken from the
        current local time.
    """
    from time import strftime
    return os.path.join(root_logdir, strftime("run_%Y_%m_%d-%H_%M_%S"))

In [None]:
# Fix the log directory of this run once, so every later use refers to the same folder.
run_logdir = get_run_logdir()
print("Current run logdir for Tensorboard: ", run_logdir)

In [None]:
# Display the log directory chosen for this run.
run_logdir

### Keras Callbacks for Tensorboard
With Keras you can use a TensorBoard callback to write log files and visualize the different graphs (loss and validation curves) in TensorBoard.


In [None]:
# TensorBoard callback that writes its logs into this run's directory.
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

## Building Model Architecture

In [None]:
from keras.layers import LeakyReLU  # NOTE(review): not used in this cell

input_shape=[784]
# Shape of one sample as produced by the reshape step: 28 x 28 pixels, 1 channel.
input_shape_notFlattened=[28,28,1]

# Hyperparameters shared by all hidden layers
learning_rt = 1e-03
activation_fn = "relu"
initializer = "he_normal"
regularizer =  None #keras.regularizers.l2(0.01)

# Width of each hidden Dense layer, in order from input to output.
hidden_layer_units = [500, 500, 300, 300, 300, 300]

# Model building
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=input_shape_notFlattened))

# Every hidden Dense layer is preceded by its own BatchNormalization layer.
for units in hidden_layer_units:
    model.add(keras.layers.BatchNormalization())
    model.add(
        keras.layers.Dense(
            units,
            activation=activation_fn,
            kernel_initializer=initializer,
            kernel_regularizer=regularizer,
        )
    )

# Output layer: one probability per digit class.
model.add(keras.layers.Dense(10, activation="softmax", kernel_initializer="glorot_uniform"))

optimizer = keras.optimizers.Adam(learning_rate=learning_rt)

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'] )


In [None]:
# Print the layer overview of the model.
model.summary()

## Model Checkpoints

In [None]:
# Save the complete model (not only the weights) to "my_train_model.h5";
# save_best_only=True keeps only the best model seen so far
# (judged by the callback's monitored quantity; see the Keras docs for the default).
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_train_model.h5", save_best_only=True, save_weights_only=False)

## Model Training

In [None]:
# Stop training once there has been no improvement for 20 epochs.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)

history = model.fit(
    train_ds,
    epochs=200,
    validation_data=val_ds,
    callbacks=[checkpoint_cb, early_stopping_cb, tensorboard_cb],
)

## Visualizing the Progress

In [None]:
# One curve per recorded metric; DataFrame.plot adds a legend with the column
# names, so the curves can be told apart.
history_df = pd.DataFrame(history.history)
ax = history_df.plot(figsize=(10, 6))
ax.set(title="Training history", xlabel="Epoch", ylabel="Metric value")
ax.grid(True)
plt.show()

### Model Training with Full Dataset 
In this part I will train the model with the full dataset. This time I will use the discovered hyperparameters from previous section.

In [None]:

# Model building: same architecture as the model of the previous section.
hidden_layer_units = [500, 500, 300, 300, 300, 300]

model_full = keras.models.Sequential()
model_full.add(keras.layers.Flatten(input_shape=input_shape_notFlattened))

# Every hidden Dense layer is preceded by its own BatchNormalization layer.
for units in hidden_layer_units:
    model_full.add(keras.layers.BatchNormalization())
    model_full.add(
        keras.layers.Dense(
            units,
            activation=activation_fn,
            kernel_initializer=initializer,
            kernel_regularizer=regularizer,
        )
    )

# Output layer: one probability per digit class.
model_full.add(keras.layers.Dense(10, activation="softmax", kernel_initializer="glorot_uniform"))

# Adam, as in the previous section: the text of this section states that the
# hyperparameters found there are reused, and this cell used SGD before.
optimizer = keras.optimizers.Adam(learning_rate=learning_rt)

model_full.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'] )

In [None]:
# Print the layer overview of the model trained on the full dataset.
model_full.summary()

In [None]:
# creating a new log dir for tensorboard
tensorboard_cb_f = keras.callbacks.TensorBoard(get_run_logdir())
# No validation data is used in this run, so save_best_only is switched off
# and the complete model is written to "my_modell_full.h5".
checkpoint_cb_f = keras.callbacks.ModelCheckpoint("my_modell_full.h5", save_best_only=False, save_weights_only=False)

In [None]:
# Preparing the full labelled data (train + validation, originals + crops)
# for the tensorflow data api.

training_dataset_all = training_dataset.concatenate(training_crop_dataset)
val_dataset_all = val_dataset.concatenate(val_crop_dataset)

training_ds_all = training_dataset_all.concatenate(val_dataset_all)

# The four parts are stored one after another, so the shuffle buffer spans the
# whole dataset; with a smaller buffer the end of every epoch would consist of
# validation samples only. Assumes the cardinality is known, which holds for
# datasets built with from_tensor_slices / map / concatenate.
training_ds_all = (
    training_ds_all
    .shuffle(training_ds_all.cardinality())
    .batch(32)
    .prefetch(1)
)

In [None]:
# Final training run on all labelled data; no validation data is passed here.
history_full = model_full.fit(
    training_ds_all,
    epochs=60,
    callbacks=[tensorboard_cb_f, checkpoint_cb_f],
)

In [None]:
# One curve per recorded metric; DataFrame.plot adds a legend with the column
# names, so the curves can be told apart.
history_full_df = pd.DataFrame(history_full.history)
ax = history_full_df.plot(figsize=(10, 6))
ax.set(title="Training history (full dataset)", xlabel="Epoch", ylabel="Metric value")
ax.grid(True)
plt.show()

# Image Prediction of Unknown Data (Test Data)

## Preparing Test Data
As done previously for the training data, we need to create a TF dataset of the test set as well.

In [None]:
# Converting the DataFrame format into a TensorFlow compatible format:
# (samples, 28, 28, 1). np.asarray accepts both a DataFrame and an already
# reshaped array, so this cell can be re-run without failing on `.values`.
X_test = np.asarray(X_test).reshape(-1, 28, 28, 1)

# The test set has no labels, so the dataset consists of the images only.
test_dataset = tf.data.Dataset.from_tensor_slices(
    tf.cast(X_test, tf.float32)
)

In [None]:
# Display the dataset object (element spec) of the test data.
test_dataset

In [None]:
# Batch the test images without shuffling, so the original row order is kept.
test_ds = test_dataset.batch(32).prefetch(1)

## Creating Competition File

In [None]:
# Empty frame with the two columns of the submission file.
# NOTE(review): this variable is assigned again in the prediction cell further down.
mnist_competition_file = pd.DataFrame(columns=['ImageId','Label'])

## Prediction of Testdata

In [None]:
# Visualizing the first image of the first test batch
plt.figure(figsize=(12, 12))
for X_batch in test_ds.take(1):
    for index in range(1):
        plt.subplot(3, 3, index + 1)
        # Drop the trailing channel axis so imshow receives a plain 2-D image.
        plt.imshow(tf.squeeze(X_batch[index], axis=-1))

plt.show()

In [None]:
# Class probabilities for the first image of the first test batch. The former
# loop ran exactly once and never used its loop variable, so it was removed.
first_batch_probabilities = model_full.predict(test_ds.take(1))
print("Propability of all lables for given pixels: ", first_batch_probabilities[0])

In [None]:
# The predicted digit is the index of the highest probability for the first test image.
print("Predicted Digit: ",np.argmax(model_full.predict(test_ds.take(1))[0]))

In [None]:
# Predict the class probabilities of every test image ...
predictions = model_full.predict(test_ds)
# ... and reduce each row to the digit with the highest probability.
predictions = np.argmax(predictions, axis=1)

# Build the frame in the layout the competition requires:
# ImageId counts the test rows starting at 1, Label is the predicted digit.
mnist_competition_file = pd.DataFrame(
    {
        "ImageId": np.arange(1, len(predictions) + 1),
        "Label": predictions,
    }
)

In [None]:
# Display the submission frame for a visual check.
mnist_competition_file

In [None]:
# Make sure both submission columns are stored as integers.
mnist_competition_file = mnist_competition_file.astype({"ImageId": int, "Label": int})

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
mnist_competition_file.to_csv('mnist_submission.csv', index=False)