In [1]:
# import required packages/libraries:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import math
import csv

In [2]:
# Notes: 
# Dataset: https://archive.ics.uci.edu/ml/datasets/Letter+Recognition
# Objective: train a model that can identify letters from obscured / distorted text.

In [3]:
# Load the letter-recognition dataset. The file carries a ".data" extension but
# is parsed here as comma-separated text: column 0 is a capital-letter label and
# every remaining column is converted to an integer.
csv.field_size_limit(100000000)
data = []

# newline='' is what the csv module recommends for files handed to csv.reader.
with open('letter-recognition.data', 'r', encoding="utf8", newline='') as file:
    rows = csv.reader(file)
    for row in rows:
        # csv.reader yields an empty list for blank lines (e.g. a trailing
        # newline at the end of the file); skip them instead of crashing.
        if not row:
            continue

        label = row[0]
        # Fail loudly on anything that is not a single capital letter. Without
        # this check a stray character would become an out-of-range (or
        # negative) class index and be filed under the wrong letter later on.
        if len(label) != 1 or not 'A' <= label <= 'Z':
            raise ValueError(
                "line {}: expected a label in 'A'-'Z', got {!r}".format(
                    rows.line_num, label
                )
            )

        # Encode the label as a class index: 'A' -> 0, 'B' -> 1, ..., 'Z' -> 25.
        row[0] = ord(label) - ord('A')
        int_list = list(map(int, row))
        data.append(np.array(int_list))

# convert to numpy array for access to np functions.
# Each row is [class_index, feature_1, feature_2, ...].
npdata = np.array(data)

In [4]:
# Group the samples by letter. letter_array is a list of 26 buckets (one per
# letter); each bucket is a plain Python LIST of numpy arrays, and each array
# is one sample with its label column stripped off.
# e.g. letter_array[0] holds every sample whose label index is 0 (the letter 'A').
letter_array = [[] for _ in range(26)]

for sample in npdata:
    # Column 0 is the class index; everything after it is the feature vector.
    label_index, features = sample[0], sample[1:]
    letter_array[label_index].append(features)

In [5]:
# Shuffle each letter's samples, then split every letter into a training part
# and a testing part, so that each letter is divided by the same ratio.

# Dedicated, seeded generator: a fresh kernel + Run All now reproduces the same
# split, and the global numpy random state is left untouched.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# shuffle the data.
# NOTE: this shuffles the buckets of letter_array in place, so re-running this
# cell without re-running the bucketing cell shuffles already-shuffled data.
for letter in letter_array:
    split_rng.shuffle(letter)

# splitting into testing and training data.
# Fraction of each letter's samples that goes to the training set.
train_percent = 0.8

# One class per bucket; also the width of the one-hot label vectors.
num_letters = len(letter_array)

# Per-letter pieces, indexed by class: entry i belongs to letter i.
training_dataX = []
training_dataY = []
testing_dataX = []
testing_dataY = []

# for each letter, adding to respective lists above.
for i in range(num_letters):
    # One-hot label shared by every sample of letter i.
    one_hot = np.zeros(num_letters)
    one_hot[i] = 1

    # The first `cutoff` shuffled samples train; the remainder test.
    cutoff = math.floor(len(letter_array[i]) * train_percent)
    train_samples = letter_array[i][:cutoff]
    test_samples = letter_array[i][cutoff:]

    training_dataX.append(train_samples)
    training_dataY.append(np.tile(one_hot, (len(train_samples), 1)))

    testing_dataX.append(test_samples)
    testing_dataY.append(np.tile(one_hot, (len(test_samples), 1)))

# create combined training data.
# Rows are ordered letter by letter (all of class 0, then class 1, ...).
X_train = np.concatenate(training_dataX)
Y_train = np.concatenate(training_dataY)
X_test = np.concatenate(testing_dataX)
Y_test = np.concatenate(testing_dataY)

# Sanity checks: features and labels line up, and no sample was lost or duplicated.
assert len(X_train) == len(Y_train) and len(X_test) == len(Y_test)
assert len(X_train) + len(X_test) == sum(len(bucket) for bucket in letter_array)

In [6]:
# Fully-connected classifier, declared as one ordered list of layers:
# 16 input features -> two hidden blocks -> 26-way softmax.
model = keras.models.Sequential([
    # Hidden block 1. input_shape=(16,) because the letter identifier column
    # was removed from each sample, leaving only the feature values.
    layers.Dense(512, input_shape=(16,)),
    layers.Activation('relu'),
    layers.Dropout(0.2),

    # Hidden block 2: same width, activation and dropout rate as block 1.
    layers.Dense(512),
    layers.Activation('relu'),
    layers.Dropout(0.2),

    # Output: one unit per letter (26), normalised with softmax.
    layers.Dense(26),
    layers.Activation('softmax'),
])

In [7]:
# Configure training (settings taken from the sample code): Adam optimizer,
# categorical cross-entropy loss to match the one-hot labels, and accuracy
# reported as the tracked metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [8]:
# Train the model (settings taken from the sample code) and keep the returned
# History object so the per-epoch metrics can be inspected afterwards.
# The val_* numbers printed each epoch are computed on X_test / Y_test.
# NOTE(review): the test split doubles as the validation set here, so no
# separate untouched hold-out set is visible in this part of the notebook.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_test, Y_test),
    epochs=20,
    batch_size=128,
    verbose=2,
)

Train on 15989 samples, validate on 4011 samples
Epoch 1/20
15989/15989 - 2s - loss: 1.6073 - accuracy: 0.5289 - val_loss: 0.8642 - val_accuracy: 0.7412
Epoch 2/20
15989/15989 - 1s - loss: 0.8528 - accuracy: 0.7411 - val_loss: 0.6389 - val_accuracy: 0.8090
Epoch 3/20
15989/15989 - 1s - loss: 0.6625 - accuracy: 0.7937 - val_loss: 0.4992 - val_accuracy: 0.8532
Epoch 4/20
15989/15989 - 1s - loss: 0.5414 - accuracy: 0.8301 - val_loss: 0.4196 - val_accuracy: 0.8724
Epoch 5/20
15989/15989 - 1s - loss: 0.4584 - accuracy: 0.8592 - val_loss: 0.3669 - val_accuracy: 0.8913
Epoch 6/20
15989/15989 - 1s - loss: 0.4024 - accuracy: 0.8725 - val_loss: 0.3331 - val_accuracy: 0.8968
Epoch 7/20
15989/15989 - 1s - loss: 0.3596 - accuracy: 0.8839 - val_loss: 0.2806 - val_accuracy: 0.9130
Epoch 8/20
15989/15989 - 1s - loss: 0.3277 - accuracy: 0.8952 - val_loss: 0.2654 - val_accuracy: 0.9187
Epoch 9/20
15989/15989 - 1s - loss: 0.2993 - accuracy: 0.9026 - val_loss: 0.2412 - val_accuracy: 0.9260
Epoch 10/20
159