### In this notebook I use a simple neural network built with Keras, plus a "cheat" to get more training data without any further data processing

## Read and normalize data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical 
import time

In [None]:
# Read both competition CSV files through one shared reader call; the paths are unchanged.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/digit-recognizer/train.csv',
        '../input/digit-recognizer/test.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of each freshly loaded table.
for caption, shape in (('Train shape:', train.shape), ('Test shape:', test.shape)):
    print(caption, shape)

In [None]:
# One-hot encode the `label` column into 10 classes, the target format expected by
# the categorical cross-entropy loss used when the model is compiled.
y = to_categorical(train['label'], num_classes=10)

In [None]:
# Turn each row into a 28x28 single-channel image.
# The first training column is skipped (presumably the `label` column — confirm against
# the CSV header); every column of the test table is used.
image_shape = (-1, 28, 28, 1)
train = train.iloc[:, 1:].to_numpy().reshape(image_shape)
test = test.to_numpy().reshape(image_shape)

In [None]:
# Preview the first nine training digits in a 3x3 grid.
for i in range(9):
    plt.subplot(3, 3, i + 1)
    # Select the single channel explicitly so imshow receives a plain 2-D (28, 28)
    # array, which is the documented scalar-image shape; a trailing axis of length 1
    # is rejected by some matplotlib releases.
    plt.imshow(train[i, :, :, 0], cmap='gray')

In [None]:
# Flatten every image into a 784-long float64 vector and divide by 255
# (presumably mapping 0-255 pixel intensities into [0, 1]).
n_pixels = 28 * 28
train = train.reshape(len(train), n_pixels).astype(np.float64) / 255
test = test.reshape(len(test), n_pixels).astype(np.float64) / 255

In [None]:
# Confirm the shapes after flattening: one row per image, one column per pixel.
for caption, shape in (('Train shape:', train.shape), ('Test shape:', test.shape)):
    print(caption, shape)

## Split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled competition images for validation; the fixed
# random_state keeps the split identical between runs.
split_parts = train_test_split(train, y, random_state=42, test_size=0.1)
x_train, x_val, y_train, y_val = split_parts


## Building NN architecture

#### Three-layer NN with Dropout for regularization and Batch Normalization to stabilize and speed up training.

In [None]:
import tensorflow as tf
from tensorflow.keras import Input, Sequential
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout

# Fully connected classifier for flattened 28x28 digits:
#   784 inputs -> Dense(256, relu) -> Dense(64, relu) -> Dense(10, softmax),
# with batch normalization and 20% dropout after each hidden layer.
#
# Every Keras symbol is imported from `tensorflow.keras` so the layers belong to the
# same Keras implementation as `Sequential`; mixing in layers from the standalone
# `keras` package can fail when the two installed versions differ.
model = Sequential()
# Declare the input shape with an explicit Input object rather than `input_dim`
# on the first Dense layer.
model.add(Input(shape=(784,)))
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
# One output unit per digit class; softmax turns the outputs into class probabilities.
model.add(Dense(10, activation='softmax'))

## Training

### For training I chose Adam with lr=0.001, because it gave the best results with my NN architecture.

In [None]:
# First-stage setup: Adam with a 0.001 learning rate, categorical cross-entropy
# for the one-hot targets, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
print(optimizer.learning_rate)

In [None]:
# Train for 45 epochs in mini-batches of 64, evaluating the held-out split after
# every epoch, then print the wall-clock duration rounded to whole seconds.
start = time.time()
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=45,
)
end = time.time()
elapsed_seconds = round(end - start)
print("Algorithm time is {} s:".format(elapsed_seconds))

In [None]:
import matplotlib.pyplot as plt
# Learning curves from the last `fit` call: loss in the top panel, accuracy in the
# bottom panel, training in blue and validation in red.
plt.rcParams["figure.figsize"] = (10,10)
fig, ax = plt.subplots(2,1)

ax[0].plot(history.history['loss'], color='b', label="Training loss")
# No `axes=` keyword is needed: calling plot on ax[0] already attaches the line to it.
ax[0].plot(history.history['val_loss'], color='r', label="validation loss")
ax[0].set_title("Loss per epoch")
ax[0].set_xlabel("Epoch")
ax[0].set_ylabel("Loss")
legend = ax[0].legend(loc='best', shadow=True)

ax[1].plot(history.history['accuracy'], color='b', label="Training accuracy")
ax[1].plot(history.history['val_accuracy'], color='r',label="Validation accuracy")
ax[1].set_title("Accuracy per epoch")
ax[1].set_xlabel("Epoch")
ax[1].set_ylabel("Accuracy")
legend = ax[1].legend(loc='best', shadow=True)

# Keep the titles and axis labels of the two stacked panels from overlapping.
fig.tight_layout()

In [None]:
# Score every test image with the softmax model and keep the index of the
# highest-scoring class in each row as the predicted digit.
predictions = model.predict(test).argmax(axis=1)

### If we upload these predictions we will get roughly ~0.979 accuracy and, as the plots show, our model is quite overfitted. Let's fix that by giving the model more data — cheat data :)

## Load more data

In [None]:
# Load the MNIST digits that ship with Keras.
# Both names are imported from `tensorflow.keras`, matching the rest of the notebook:
# the `keras.utils.np_utils` module is not available in newer Keras releases, and
# mixing the standalone `keras` package with `tensorflow.keras` can break when the
# installed versions differ.
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# `load_data` returns a (train, test) pair of (images, labels) tuples; the MNIST test
# split is stored under the `*_val_1` names.
(x_train_1, y_train_1), (x_val_1, y_val_1) = mnist.load_data()
print('Train: X=%s, y=%s' % (x_train_1.shape, y_train_1.shape))
print('Test: X=%s, y=%s' % (x_val_1.shape, y_val_1.shape))

In [None]:
# Apply the same preprocessing as the competition data: flatten each image to 784
# float64 values and divide by 255.
x_train_1 = x_train_1.reshape((x_train_1.shape[0], 28*28)).astype('float64') / 255
x_val_1 = x_val_1.reshape((x_val_1.shape[0], 28*28)).astype('float64') / 255
# Fix the one-hot width at 10 classes, as is done for the competition labels, so the
# targets always match the 10-unit softmax output instead of depending on the
# largest label present in the array.
y_train_1 = to_categorical(y_train_1, num_classes=10)
y_val_1 = to_categorical(y_val_1, num_classes=10)

In [None]:
#x_train, x_val_1 , y_train, y_val_1 = train_test_split(train, y, test_size=0.8, random_state=42)

In [None]:
# Stack both MNIST splits row-wise into one training set (images and one-hot labels).
new_x_train = np.concatenate([x_train_1, x_val_1])
new_y_train = np.concatenate([y_train_1, y_val_1])
# Disabled: also append the competition training split.
#new_x_train = np.concatenate((new_x_train, x_train), axis=0)
#new_y_train = np.concatenate((new_y_train, y_train), axis=0)

In [None]:
# Show the size of the merged MNIST training arrays.
for caption, shape in (
    ('new_x_train shape:', new_x_train.shape),
    ('new_y_train shape:', new_y_train.shape),
):
    print(caption, shape)

### Now, for the split, we train on the MNIST dataset from Keras and validate on the original data from this competition.

In [None]:
#x_val, x_test , y_val, y_test = train_test_split(train, y, test_size=0.15, random_state=42)

In [None]:
# Second training stage: recompile the same `model` with a reduced Adam learning
# rate (0.0005) and continue training on the merged MNIST arrays for 60 epochs,
# validating on the full labelled competition set.
# NOTE(review): `train` still contains the rows that were fitted as `x_train` in the
# first stage, so these validation figures are not measured on unseen data —
# confirm this is intended.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
print(optimizer.learning_rate)

start = time.time()
history = model.fit(
    new_x_train,
    new_y_train,
    validation_data=(train, y),
    batch_size=64,
    epochs=60,
)
end = time.time()
elapsed_seconds = round(end - start)
print("Algorithm time is {} s:".format(elapsed_seconds))

### As we can see, our accuracy and loss are very good, and the same holds for the validation data

In [None]:
#model.evaluate(x_test, y_test)

In [None]:
#result = model.predict(x_test)

In [None]:
#result = np.argmax(result , axis=1)
#y_test = np.argmax(y_test , axis=1)

### Errors in images

In [None]:

# for result, y_test in zip(result, y_test):
#     if result != y_test:
#         print(y_test, 'has been classified as ', result)

In [None]:
# Re-score the competition test images with the retrained model and keep the
# highest-scoring class per row as the predicted digit.
predictions = model.predict(test).argmax(axis=1)

In [None]:
# Fill the `Label` column of the sample submission with the predicted digits and
# save the result without the DataFrame index.
submission = pd.read_csv('../input/digit-recognizer/sample_submission.csv').assign(
    Label=predictions
)
submission.to_csv('submission.csv', index=False)