In [1]:
import pickle
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
import os
import logging
import tensorflow as tf
import warnings

2024-10-06 17:15:12.104851: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-06 17:15:12.139237: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Data Loading

In [2]:
def load_data(file_path):
    with open(file_path, 'rb') as file:
        dataset = pickle.load(file)
    return dataset['data'], dataset['labels']

data_path = 'EMNIST_Byclass_Small/'
X_train, y_train = load_data(data_path+'emnist_train.pkl')
X_test, y_test = load_data(data_path+'emnist_test.pkl')

print(X_train[0])



[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0  18  36  18   5   0   3  21  37  37  37  37  21
    3   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   3 106 208 111  41  34  84 170 215 217 217 215 170
   79  34   3   0   0   0   0   0   0   0]
 [  0   0   0   0   0   2  77 217 192 159 204 233 251 254 254 252 250 249
  220 174  67   3   0   0   0   0   0   0]
 [  0   0   0   0   0   0  11 142 248 251 254 253 234 217 215 172 130 173
  233 244 174  34   0   0   0   0   0   0]
 [  0   0   0   0   0   0  10 142 250 254 254 248 179 129 125  82  41  84
  187 233 220  79   3   0   0   0   0   0]
 [  0   0   0   0   0   0  46 208 254 254 254 233  84   7   4   2   0   4
   84 173 249 170  21   2   0   0   0   0]
 [  0   0   0   0   1  10 128 246 254 255 254 218  48   0   0   

In [3]:
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255.0
X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255.0

num_classes = 62 
train_labels = to_categorical(y_train, num_classes)
test_labels = to_categorical(y_test, num_classes)

print(f"Training data shape: {X_train.shape}, Training labels shape: {train_labels.shape}")
print(f"Test data shape: {y_test.shape}, Test labels shape: {test_labels.shape}")

Training data shape: (100000, 28, 28, 1), Training labels shape: (100000, 62)
Test data shape: (20000,), Test labels shape: (20000, 62)


In [4]:
epochs = 20
batch_size = 32
loss="sparse_categorical_crossentropy"
#loss="categorical_crossentropy"
input_shape = (28, 28, 1)
num_classes = 62
hyperparameters = {
    'hidden_size' : 64,
    'dropout_rate' : 0.3,
    'learning_rate' : 1e-4,
    'activation' : 'relu',
    'output_activation' : 'softmax'
}

In [5]:
def residual_block(inputs, filters, kernel_size=3, stride=1, dropout_rate=0.3, activation='relu'):
    x = layers.Conv2D(filters, kernel_size=kernel_size, strides=stride, padding="same")(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.Dropout(dropout_rate)(x) 

    x = layers.Conv2D(filters, kernel_size=kernel_size, strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)

    if stride != 1 or inputs.shape[-1] != filters:
        inputs = layers.Conv2D(filters, kernel_size=1, strides=stride, padding='same')(inputs)
        inputs = layers.BatchNormalization()(inputs)
    x = layers.Dropout(dropout_rate)(x) 

    x = layers.Add()([x, inputs])
    x = layers.Activation(activation)(x)
    return x

def build_resnet(input_shape, num_classes, hyperparameters):
    inputs = layers.Input(shape=input_shape)
    dropout_rate = hyperparameters['dropout_rate']
    hidden_size = hyperparameters['hidden_size']
    activation= hyperparameters['activation']
    
    # Initial convolution layer
    x = layers.Conv2D(hidden_size, (3, 3), padding='same', strides=1)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)

    # Residual blocks
    x = residual_block(x, hidden_size, dropout_rate=dropout_rate, activation=activation)
    x = residual_block(x, hidden_size*2, stride=2, dropout_rate=dropout_rate, activation=activation)  # downsample
    x = residual_block(x, hidden_size*4, stride=2, dropout_rate=dropout_rate, activation=activation)  # downsample

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(hidden_size*8, activation=activation)(x)
    x = layers.Dropout(dropout_rate)(x)
    
    outputs = layers.Dense(num_classes, activation=hyperparameters['output_activation'])(x)

    model = models.Model(inputs, outputs)
    
    optimizer=Adam(learning_rate=hyperparameters['learning_rate'])
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model

In [6]:
def build_cnn(input_shape, num_classes, hyperparameters):
    hidden_size = hyperparameters['hidden_size']
    dropout_rate = hyperparameters['dropout_rate']
    activation= hyperparameters['activation']
    
    inputs = layers.Input(shape=input_shape)

    x = layers.Conv2D(hidden_size, (3, 3))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(dropout_rate-0.1)(x)

    x = layers.Conv2D(hidden_size*2, (3, 3))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(dropout_rate)(x)

    x = layers.Conv2D(hidden_size*4, (3, 3))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(dropout_rate+0.1)(x)

    x = layers.Flatten()(x)

    # Fully connected layer
    x = layers.Dense(hidden_size)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)

    outputs = layers.Dense(num_classes, activation=hyperparameters['output_activation'])(x)
    
    model = models.Model(inputs, outputs)
    
    optimizer=Adam(learning_rate=hyperparameters['learning_rate'])
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    return model

In [7]:
def build_mlp(input_shape, num_classes, hidden_size):
    hidden_size = hyperparameters['hidden_size']
    dropout_rate = hyperparameters['dropout_rate']
    activation= hyperparameters['activation']
    inputs = layers.Input(shape=input_shape)

    # Flatten the input (28, 28, 1) -> (784,)
    x = layers.Flatten(input_shape=input_shape)(inputs)

    x = layers.Dense(hidden_size*8)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.Dropout(dropout_rate-0.1)(x)

    x = layers.Dense(hidden_size*4)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.Dropout(dropout_rate)(x)

    x = layers.Dense(hidden_size*2)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.Dropout(dropout_rate+0.1)(x)

    outputs = layers.Dense(num_classes, activation=hyperparameters['output_activation'])(x)
    
    model = models.Model(inputs, outputs)
    
    optimizer=Adam(learning_rate=hyperparameters['learning_rate'])
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    return model

In [8]:
model_res = build_resnet(input_shape, num_classes, hyperparameters)
history = model_res.fit(
    X_train, y_train,                 
    validation_split=0.2,   
    epochs=epochs,                        
    batch_size=batch_size
)

2024-10-06 17:15:13.679394: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-06 17:15:13.701946: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-06 17:15:13.701984: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-06 17:15:13.704568: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-06 17:15:13.704602: I external/local_xla/xla/stream_executor

Epoch 1/20


I0000 00:00:1728195316.712849   84408 service.cc:145] XLA service 0x7f8fc0001480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728195316.712899   84408 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-10-06 17:15:16.778556: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-06 17:15:17.072662: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907






[1m   5/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:08[0m 27ms/step - accuracy: 0.0314 - loss: 4.1815 

I0000 00:00:1728195325.423758   84408 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 7ms/step - accuracy: 0.3694 - loss: 2.5082 - val_accuracy: 0.7398 - val_loss: 0.8204
Epoch 2/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.7434 - loss: 0.8254 - val_accuracy: 0.8235 - val_loss: 0.5323
Epoch 3/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.7938 - loss: 0.6248 - val_accuracy: 0.8282 - val_loss: 0.5016
Epoch 4/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.8125 - loss: 0.5512 - val_accuracy: 0.8403 - val_loss: 0.4695
Epoch 5/20
[1m 221/2500[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m14s[0m 6ms/step - accuracy: 0.8285 - loss: 0.5135

In [9]:
model_cnn = build_cnn(input_shape, num_classes, hyperparameters)
history = model_cnn.fit(
    X_train, y_train,                 
    validation_split=0.2,   
    epochs=epochs,                        
    batch_size=batch_size
)

  super().__init__(


Epoch 1/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.3289 - loss: 2.7847 - val_accuracy: 0.6598 - val_loss: 1.2142
Epoch 2/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6417 - loss: 1.2533 - val_accuracy: 0.7253 - val_loss: 0.8955
Epoch 3/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.6977 - loss: 0.9835 - val_accuracy: 0.7551 - val_loss: 0.7842
Epoch 4/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7293 - loss: 0.8536 - val_accuracy: 0.7721 - val_loss: 0.6992
Epoch 5/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7522 - loss: 0.7736 - val_accuracy: 0.7811 - val_loss: 0.6603
Epoch 6/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7644 - loss: 0.7110 - val_accuracy: 0.7935 - val_loss: 0.6202
Epoch 7/20
[1m2

In [10]:
model_mlp = build_mlp(input_shape, num_classes, hyperparameters)
history = model_mlp.fit(
    X_train, y_train,                
    validation_split=0.2,   
    epochs=epochs,                        
    batch_size=batch_size
)

Epoch 1/20


  super().__init__(**kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.3284 - loss: 2.8211 - val_accuracy: 0.6636 - val_loss: 1.2684
Epoch 2/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6053 - loss: 1.4698 - val_accuracy: 0.7255 - val_loss: 0.9408
Epoch 3/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6636 - loss: 1.1659 - val_accuracy: 0.7577 - val_loss: 0.7952
Epoch 4/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7048 - loss: 0.9942 - val_accuracy: 0.7771 - val_loss: 0.7176
Epoch 5/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7282 - loss: 0.9021 - val_accuracy: 0.7892 - val_loss: 0.6654
Epoch 6/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7475 - loss: 0.8215 - val_accuracy: 0.7990 - val_loss: 0.6244
Epoch 7/20
[1m2500/2500[0