In [1]:
# Install required packages
%pip install -q numpy pandas pyarrow matplotlib seaborn scikit-learn tensorflow xgboost emnist
%reset -f

In [15]:
# Import packages
import os
import string
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import emnist
from IPython.display import display, Markdown

# ML packages
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, make_scorer, classification_report
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# XGBoost (SVM)
from xgboost import XGBClassifier
# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D

In [3]:
# Constants
SIZE = 28
REBUILD = True
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [4]:
# Helper functions

# Convert an integer label to the corresponding uppercase character
def int_to_char(label):
    if label < 10:
      return str(label)
    elif label < 36:
      return chr(label - 10 + ord('A'))
    else:
      return chr(label - 36 + ord('a'))

# Display a single image and its corresponding label
def show_image(row):
  image = row['image']
  label = row['label']
  plt.imshow(image, cmap='gray')
  plt.title('Label: ' + int_to_char(label))
  plt.axis('off')
  plt.show()

# Display a list of images as a grid of num_cols columns
def show_grid(data, title=None, num_cols=5, fig_size=(20,10)):
  num_images = len(data)
  num_rows = (num_images - 1) // num_cols + 1
  fig, axes = plt.subplots(num_rows, num_cols, figsize=figsize)
  if title is not None:
    fig.subtitle(title, fontsize=16)
  for i in range(num_rows):
      for j in range(num_cols):
          index = i * num_cols + j
          if index < num_images:
              axes[i, j].imshow(data.iloc[index]['image'], cmap='gray')
              axes[i, j].axis('off')
              label = int_to_char(data.iloc[index]['label'])
              axes[i, j].set_title(label)
  plt.show()

# Get a random image of a given label from the dataset
def get_image_by_label(data, label):
  images = data[data['label'] == label]['image'].tolist()
  return random.choice(images)

# Plot the training and validation accuracy during the training of the model
def plot_accuracy(history):
  acc = history.history['accuracy']
  val_acc = history.history['val_accuracy']
  epochs = range(1, len(acc) + 1)
  plt.plot(epochs, acc, 'bo', label = 'Training Accuracy')
  plt.plot(epochs, val_acc, 'b', label = 'Validation Accuracy')
  plt.title('Training and Validation Accuracy')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.legend()
  plt.show()

# Plot the training and validation loss during the training of the model
def plot_loss(history):
  loss = history.history['loss']
  val_loss = history.history['val_loss']
  epochs = range(1, len(loss) + 1)
  plt.plot(epochs, loss, 'bo', label='Training loss')
  plt.plot(epochs, val_loss, 'b', label='Validation loss')
  plt.title('Training and validation loss')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.legend()
  plt.show()

# Normalize the pixel values of the images in the dataset to have zero mean and unit variance
# This is a common preprocessing step for neural networks
def normalize_images(images):
  images = np.array(images)
  mean = images.mean()
  std = images.std()
  images = (images - mean) / std
  return images.tolist()

# Display metrics for a model
def display_metrics(task, model_name, metrics_dict):
  metrics_df = pd.DataFrame()
  cm_df = pd.DataFrame()
  for key, value in metrics_dict[task][model_name].items():
    if type(value) == np.ndarray:
      cm_df = pd.DataFrame(value, index=['Actual 0','Actual 1'], columns=['Predicted 0', 'Predicted 1'])
    else:
      metrics_df[key] = [value]
  display(Markdown(f'## Performance Metrics: {model_name}'))
  display(metrics_df)
  display(Markdown(f'## Confusion Matrix: {model_name}'))
  display(cm_df)


In [5]:
# Load EMNIST 'byclass' data

# Extract the training split as images and labels
image, label = emnist.extract_training_samples('byclass')

# Add columns for each pixel value (28x28 = 784 columns)
emnist_train = pd.DataFrame()

# Add a column with the image data as a 28x28 array
emnist_train['image'] = list(image)
emnist_train['image_flat'] = emnist_train['image'].apply(lambda x: np.array(x).reshape(-1))

# Add a column showing the label
emnist_train['label'] = label

# Convert labels to characters
class_label = np.array([int_to_char(l) for l in label])

# Add a column with the character corresponding to the label
emnist_train['class'] = class_label

# Repeat for the test split
image, label = emnist.extract_test_samples('byclass')
class_label = np.array([int_to_char(l) for l in label])
emnist_test = pd.DataFrame()
emnist_test['image'] = list(image)
emnist_test['image_flat'] = emnist_test['image'].apply(lambda x: np.array(x).reshape(-1))
emnist_test['label'] = label
emnist_test['class'] = class_label

# Combine the training and test data for later use
emnist_all = pd.concat([emnist_train, emnist_test], axis=0)

# Subset for only digits 0-9
digits = emnist_all[emnist_all['label'] < 10]

# Subset for lower- and uppercase letters
lowercase = emnist_all[(emnist_all['class'] >= 'a') & (emnist_all['class'] <= 'z')]
uppercase = emnist_all[(emnist_all['class'] >= 'A') & (emnist_all['class'] <= 'Z')]

# Subset for upper- and lowercase letters a, b, c, d, e, f, g
a2g = emnist_all[(emnist_all['class'].isin(['a', 'b', 'c', 'd', 'e', 'f', 'g']))]

# Subset for upper- and lowercase letters a, b, c, x, y, z
abcxyz = emnist_all[(emnist_all['class'].isin(['a', 'b', 'c', 'A', 'B', 'C', 'x', 'y', 'z', 'X', 'Y', 'Z']))]


Downloading emnist.zip: 536MB [00:03, 144MB/s]


In [16]:
# Create a dictionary for performance metrics
metrics_dict = {}
metrics_dict['a_to_g'] = {}
metrics_dict['upper_vs_lower'] = {}
metrics_dict['validation'] = {}



In [17]:
# Display the size of a2g, abcxyz, digits, and the full dataset
display(Markdown(f'# Dataset Sizes'))
display(Markdown(f'**a2g**: {len(a2g)}'))
display(Markdown(f'**abcxyz**: {len(abcxyz)}'))
display(Markdown(f'**digits**: {len(digits)}'))
display(Markdown(f'**emnist_all**: {len(emnist_all)}'))

# Dataset Sizes

**a2g**: 68795

**abcxyz**: 65926

**digits**: 402953

**emnist_all**: 814255

In [28]:
# Classify letters a to g

# Define small sample dataset for testing
sample_size = 3000
a2g_sample = a2g.sample(sample_size, random_state=42)

# Define features and labels
X = np.vstack(a2g_sample['image_flat'].values)
y = a2g_sample['class']

# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Preprocess data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define model
model = RandomForestClassifier(random_state=42)
params = {'n_estimators': [100, 500, 1000]}

# Perform GridSearch to find the best params
grid = GridSearchCV(model, params, cv=5)
grid.fit(X_train_scaled, y_train)

print("Best Hyperparameters::\n{}".format(grid.best_params_))

# Train the model using the best params
best_rf = RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], random_state=42)
best_rf.fit(X_train_scaled, y_train)

# Predict test set results
y_pred = best_rf.predict(X_test_scaled)

# Evaluate and store results
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
metrics_dict['a_to_g']={'accuracy': accuracy, 'classification_report':report, 'confusion_matrix': cm}

# Display metrics
display(Markdown(f'### Accuracy:'))
print(metrics_dict['a_to_g']['accuracy'])

display(Markdown(f'### Classification Report:'))
print(metrics_dict['a_to_g']['classification_report'])

display(Markdown(f'### Confusion Matrix:'))
print(metrics_dict['a_to_g']['confusion_matrix'])


Best Hyperparameters::
{'n_estimators': 500}


### Accuracy:

0.9333333333333333


### Classification Report:

              precision    recall  f1-score   support

           a       0.86      0.97      0.91        87
           b       0.98      0.84      0.91        58
           c       0.92      0.72      0.81        32
           d       0.91      0.97      0.94        95
           e       0.96      0.97      0.97       278
           f       0.93      0.83      0.88        30
           g       0.89      0.80      0.84        20

    accuracy                           0.93       600
   macro avg       0.92      0.87      0.89       600
weighted avg       0.94      0.93      0.93       600



### Confusion Matrix:

[[ 84   0   0   0   3   0   0]
 [  0  49   1   6   0   1   1]
 [  2   0  23   0   7   0   0]
 [  2   0   0  92   0   1   0]
 [  5   0   1   1 271   0   0]
 [  2   1   0   1   0  25   1]
 [  3   0   0   1   0   0  16]]


In [11]:
# Classify letters as uppercase/lowercase

# Create an 'is_upper' column and set the values
abcxyz = abcxyz.copy()
abcxyz['is_upper'] = abcxyz['class'].apply(lambda x: x.isupper())

# Print a sample of the 'abcxyz' dataset to verify
print(abcxyz.sample(10))

# Prepare data for modelling
# Features
#X = abcxyz['image_flat'].to_numpy()

# Labels
#y = abcxyz['is_upper'].to_numpy()

# Initialize the train/test split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameters
logistic_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
random_forest_params = {'n_estimators':[10,50,100]}
xgboost_params = {'n_estimators': [10,50,100], 'learning_rate': [0.01, 0.1]}

# Define models
models = [
    {'name': 'logistic_regression', 'model': LogisticRegression(max_iter=1000, random_state=42), 'params': logistic_params},
    {'name': 'random_forest', 'model': RandomForestClassifier(random_state=42), 'params': random_forest_params},
    {'name': 'xgboost', 'model': XGBClassifier(random_state=42), 'params': xgboost_params},
    {'name': 'neural_network', 'model': Sequential([Flatten(input_shape=(784,)), Dense(1, activation='sigmoid')]), 'params': None}
]

# Define scores for model evaluation
scores = ['accuracy', 'precision', 'recall', 'f1']

#sample_size = 3000
#sample = abcxyz.sample(sample_size, random_state=42)
#valid_frac = 0.3
#valid = sample.sample(frac=valid_frac, random_state=42)
#train_test = sample.drop(valid.index)

# Split data into train/test and validation sets
valid_frac = 0.3
valid = abcxyz.sample(frac=valid_frac, random_state=42)
train_test = abcxyz.drop(valid.index)

# K-fold cross-validation

k = 5
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

winning_model = None
winning_name = None
wins = 0

scaler = StandardScaler()

for train_index, test_index in kf.split(train_test['image_flat'].apply(lambda x: tuple(x)), train_test['is_upper']):
  train = train_test.iloc[train_index]
  test = train_test.iloc[test_index]

  # Preprocess data
  train_scaled = scaler.fit_transform(np.vstack(train['image_flat'].values))
  test_scaled = scaler.fit_transform(np.vstack(test['image_flat'].values))

  # Split dataset into features and labels
  y_train = train['is_upper']
  y_test = test['is_upper']

  for model_info in models:
    model_name = model_info['name']
    model = model_info['model']
    params = model_info['params']

    # Check to see if model is a neural network and compile it
    if model_name=="neural_network":
      model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train and evaluate model
    if params is not None:
      # Perform GridSearch
      grid = GridSearchCV(model, params, cv=5, scoring=make_scorer(accuracy_score))
      grid.fit(train_scaled, y_train)
      model = grid.best_estimator_

    else:
      # Train model normally
      model.fit(train_scaled, y_train)

    y_pred = (model.predict(test_scaled) > 0.5).astype("int32")

    # Calculate and store performance metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    metrics_dict['upper_vs_lower'][model_name] = {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'confusion_matrix': cm}

    # Determine if current model is the new winning model (based on f1 score)
    if wins < f1:
      wins = f1
      winning_model = model
      winning_name = model_name

                                                    image  \
450922  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
690982  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
499876  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
235593  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
682609  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
365633  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
62997   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
691390  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
123611  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
600596  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   

                                               image_flat  label class  \
450922  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     11     B   
690982  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     59     x   
499876  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     12     C   
235593  [0, 0, 0, 0, 0, 0, 0, 0,

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



In [12]:
# Apply winning model to validation set
valid_scaled = scaler.transform(np.vstack(valid['image_flat'].values))
y_pred = winning_model.predict(valid_scaled)

# If winning model is a neural network, convert probabilistic outputs to binary
if isinstance(winning_model, Sequential):
  y_pred = (y_pred > 0.5).astype("int32")

# Calculate and display performance metrics for winning model on validation set
acc = accuracy_score(valid['is_upper'], y_pred)
prec = precision_score(valid['is_upper'], y_pred)
rec = recall_score(valid['is_upper'], y_pred)
f1 = f1_score(valid['is_upper'], y_pred)
cm = confusion_matrix(valid['is_upper'], y_pred)

metrics_dict['validation'][winning_name] = {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'confusion_matrix': cm}

# Display metrics
display(Markdown(f'# Train/Test'))
display_metrics('upper_vs_lower', winning_name, metrics_dict)
display(Markdown(f'# Validation'))
display_metrics('validation', winning_name, metrics_dict)

# Train/Test

## Performance Metrics: random_forest

Unnamed: 0,accuracy,precision,recall,f1
0,0.843362,0.826207,0.901843,0.86237


## Confusion Matrix: random_forest

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3235,947
Actual 1,490,4502


# Validation

## Performance Metrics: random_forest

Unnamed: 0,accuracy,precision,recall,f1
0,0.83785,0.824394,0.887027,0.854564


## Confusion Matrix: random_forest

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7149,2007
Actual 1,1200,9422
