In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Generate Datasets from CSV files
This function runs too slowly because it goes through the CSV one row at a time. With some reshaping I could probably make it faster.

In [None]:
def generateData(filepath, imageShape=(28, 28)):
    """Load a CSV of flattened images and return normalized images plus labels.

    Every CSV row holds one flattened image. If the file has a ``label``
    column, that column is split off as the labels and all remaining columns
    are treated as pixel values; otherwise every column is pixel data and the
    returned label array is empty.

    Args:
        filepath: Path of the CSV file to read.
        imageShape: ``(height, width)`` that each row is reshaped to.
            Defaults to ``(28, 28)``.

    Returns:
        A tuple ``(imageDataset, labelDataset)`` of numpy arrays.
        ``imageDataset`` has shape ``(rows, height, width, 1)`` with the raw
        values divided by 255. ``labelDataset`` has shape ``(rows, 1)``, or
        ``(0, 1)`` when the file has no ``label`` column.

    Raises:
        ValueError: If the number of pixel columns does not equal
            ``height * width``.
    """
    file = pd.read_csv(filepath)

    # Decide from the columns, not the file name, whether labels are present:
    # a substring test on the path misfires for names such as
    # 'latest_train.csv', which contains both 'train' and 'test'.
    if 'label' in file.columns:
        labelDataset = file['label'].to_numpy().reshape(-1, 1)
        pixels = file.drop(columns='label')
    else:
        labelDataset = np.empty((0, 1))
        pixels = file

    height, width = imageShape
    # Reshape the whole table in one step instead of row by row. Passing the
    # row count explicitly (rather than -1) makes a wrong column count raise
    # instead of silently regrouping pixels. The trailing axis of size 1 is
    # the channel dimension the Keras Conv2D input expects.
    imageDataset = pixels.to_numpy().reshape(len(pixels), height, width, 1)
    # Normalize the data
    imageDataset = imageDataset / 255

    print('Images shape and amount: ', type(imageDataset), imageDataset.shape)
    print('Labels shape and amount: ', type(labelDataset), labelDataset.shape)
    return imageDataset, labelDataset
# Load the training CSV; generateData returns the normalized images and their labels.
trainImages, trainLabels = generateData(r'/kaggle/input/digit-recognizer/train.csv')

In [None]:
# Peek at the first ten labels; each one sits in its own length-1 row
# because generateData adds a trailing axis to the label array.
print(trainLabels[:10])

# Show random images

We have the images and labels in the right shape, so let's display some random images to see if they are still correct. Shuffling manually is not necessary because Keras can do that for us when we fit.

The results look good, all labels are still correct.

In [None]:
import matplotlib.pyplot as plt
import random as rd

#rd.seed(42)
#rd.shuffle(trainImages)
#rd.seed(42)
#rd.shuffle(trainLabels)

# Show a 4x4 grid of randomly chosen training images, each titled with its label.
fig = plt.figure(figsize=(10, 10))
for i in range(16):
    ax = fig.add_subplot(4, 4, i + 1)
    # randrange excludes its upper bound, so the index is always valid.
    # rd.randint(0, 42000) includes 42000, which is one past the last image
    # when the dataset holds 42000 images.
    rdNumber = rd.randrange(len(trainImages))
    ax.set_title(trainLabels[rdNumber][0])
    ax.imshow(trainImages[rdNumber], cmap='Greys_r')

# Build model
Simple convnet, I took a look at the VGG16 architecture and noticed that they use two Conv2D layers after one another so I tried this out. I had to add a padding='same' because of it.

In [None]:
import tensorflow as tf
from tensorflow.keras import models, layers

# VGG-inspired stack: three pairs of 3x3 convolutions (64, 128, 256 filters).
# In each pair the first convolution is unpadded and the second uses
# padding='same'. Max-pooling follows the first two pairs, dropout (0.2)
# follows the first pooling step and the last pair, and a small dense head
# ends in a 10-way softmax.
model = models.Sequential([
    layers.Conv2D(64, 3, activation='relu', input_shape=(28,28,1)),
    layers.Conv2D(64, 3, padding='same', activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.2),
    layers.Conv2D(128, 3, activation='relu'),
    layers.Conv2D(128, 3, padding='same', activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(256, 3, activation='relu'),
    layers.Conv2D(256, 3, padding='same', activation='relu'),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax'),
])
model.summary()

# Compile and train
I was playing around with the early stopping parameters here. The weights are restored to the best epoch.

In [None]:
# The labels are plain integer class ids (not one-hot vectors), hence the
# "sparse" variants of the loss and the accuracy metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Stop once val_loss has not improved for 5 epochs and restore the weights of the best epoch.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)
# The callback only takes effect when it is passed to fit(); without
# callbacks=[callback] all 100 epochs run and no weights are restored.
history = model.fit(x=trainImages, y=trainLabels, epochs=100, shuffle=True, validation_split=0.25, callbacks=[callback])

# Evaluation

In [None]:
# Training vs. validation accuracy per epoch, zoomed in on the 95-100% range.
fig, ax = plt.subplots()
ax.plot(history.history['sparse_categorical_accuracy'], label='accuracy')
ax.plot(history.history['val_sparse_categorical_accuracy'], label='val_accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_ylim([0.95, 1])
ax.legend(loc='lower right')

# Making predictions and writing to CSV
Each prediction for an image contains the probability for each label, so we have to grab the highest probability and get its index with np.argmax. Then we create a DataFrame and write the CSV without indices.

In [None]:
# Load the test CSV; it has no label column, so testLabels comes back empty.
testImages, testLabels = generateData(r'/kaggle/input/digit-recognizer/test.csv')
# One row of per-class scores (output of the final softmax layer) per test image.
predictions = model.predict(testImages, verbose=2)
print(predictions)

In [None]:
# For every prediction row, take the index of the highest score as the predicted label.
predictionsLabel = np.argmax(predictions, axis=1)
# Sanity check: type, length, shape and contents of the predicted labels.
print(type(predictionsLabel), len(predictionsLabel), predictionsLabel.shape, predictionsLabel)

In [None]:
# Build the submission table: ImageId counts from 1, Label is the predicted digit.
imageIds = list(range(1, len(predictionsLabel) + 1))
df = pd.DataFrame({'ImageId': imageIds, 'Label': predictionsLabel})
df

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv('/kaggle/working/predictions.csv', index=False)

In [None]:
# Save to an explicit file inside the output directory. Keras 3 rejects a
# save path that has no .keras/.h5 extension, so passing the bare directory
# fails there.
model.save('/kaggle/working/model.keras')

# Conclusion
These were my first competition submissions and it was quite fun to see an improvement in the leaderboards after some tinkering. But I certainly should learn some more about these pandas.