
# Contents

0. Setup
1. Acquisition of Data
2. Model Building


# Section 0: Setup

## Prerequisites are essential by definition

In [2]:
import pathlib
import mlcroissant as mlc
import contextlib
import zipfile, rarfile, tarfile
import re
import pickle
import kagglehub
import io
from PIL import Image
from typing import Any, Callable, Generator
from IPython.display import display, clear_output
import secrets
import ipywidgets as widgets
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import numpy.typing as npt

2025-06-16 17:01:49.937675: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750068110.154554    1272 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750068110.205181    1272 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750068110.644748    1272 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750068110.644782    1272 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750068110.644784    1272 computation_placer.cc:177] computation placer alr

In [3]:
%%html

<style>
.cell-output-ipywidget-background{
    background-color: transparent !important;
}

.jp-OutputArea-output{
    background-color: transparent;
    color: white;
}
</style>

In [17]:
dataset_url: str = "https://www.kaggle.com/datasets/xainano/handwrittenmathsymbols"

jsonld_path: pathlib.Path = pathlib.Path("./croissants/handwrittenmathsymbols-metadata.json")
dataset_data_archive_path: pathlib.Path = pathlib.Path("./data.rar")
image_dict_pickle_path: pathlib.Path = pathlib.Path("./pickles/image_dict.pkl")
model_path: pathlib.Path = pathlib.Path("./keras_models/model.keras")

load_image_dict: bool = True
load_model: bool = True

image_display_resize: tuple[int, int] = (200, 200)
image_display_resampling: int = Image.Resampling.BICUBIC
dataset_image_resize: tuple[int, int] = (45, 45)
dataset_image_resampling: str = tf.image.ResizeMethod.BICUBIC

dataset_shuffle_buffer_size: int | None = 10000
train_split: float = 0.8
dataset_batch_size: int = 64

model_init_hidden_layers: list[layers.Layer] = [
    layers.Conv2D(64, 3, activation = "relu"),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation = "relu"),
    layers.MaxPooling2D(),
    layers.Conv2D(48, 3, activation = "relu"),
    layers.SpatialDropout2D(0.2),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(48, activation = "relu"),
    layers.Dense(32, activation = "relu"),
    layers.Dropout(0.4),
    layers.Dense(16, activation = "relu")
]

train_epochs: int = 16


# Section 1: Acquisition of Data

## The very foundation of analysis

In [18]:
metadata: mlc.Metadata = mlc.Dataset(jsonld = jsonld_path).metadata

print(f"\n\n\n\x1b[2J\x1b[93;1mDataset at {dataset_url}\x1b[0m\n\n{metadata.name}\nPublished: {metadata.date_published}")

  -  [Metadata(Handwritten math symbols dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.





[2J[93;1mDataset at https://www.kaggle.com/datasets/xainano/handwrittenmathsymbols[0m

Handwritten math symbols dataset
Published: 2017-01-15 16:49:28.723000


In [19]:
image_dict: dict[str, list[bytes]] = {}

@contextlib.contextmanager
def open_archive(path: pathlib.Path) -> Generator[zipfile.ZipFile | rarfile.RarFile | tarfile.TarFile, None, None]:
    if not path.is_file():
        raise FileNotFoundError(f"File not found")

    file: zipfile.ZipFile | rarfile.RarFile | tarfile.TarFile | None = None

    match path.suffix:
        case ".zip":
            file =  zipfile.ZipFile(path)
        case ".rar":
            file =  rarfile.RarFile(path)
        case suffix if re.search(suffix, r"\.tar(\.[^ \n]+)?"):
            file = tarfile.open(path)
        case _:
            raise ValueError("File type not supported")
    try:
        yield file
    finally:
        if file is not None:
            file.close()

def archive_type_switch(archive_file: zipfile.ZipFile | rarfile.RarFile | tarfile.TarFile, input: tuple[Any, ...], zip_or_rar_callback: Callable[..., Any], tar_callback: Callable[[Any], Any]) -> Any:
    match type(archive_file):
        case zipfile.ZipFile | rarfile.RarFile:
            return zip_or_rar_callback(*input)
        case tarfile.TarFile:
            return tar_callback(*input)

def dict_get_data_archive(image_dict: dict[str, list[bytes]], path: pathlib.Path) -> None:
    with open_archive(path) as archive_file:
        for entry in archive_type_switch(archive_file, (archive_file, ), lambda x: x.infolist(), lambda x: x.get_members()):
            if archive_type_switch(archive_file, (entry, ), lambda x: x.is_dir(), lambda x: x.isdir()):
                continue

            with archive_file.open(entry, "r") as image_file:
                entry_label: str = entry.filename.split('/')[-2]

                image_dict.setdefault(entry_label, []).append(archive_type_switch(archive_file, (archive_file, ), lambda x: x.read(), lambda x: x.extractfile()))
                print(f"Added {entry.filename} with label {entry_label}")

try:
    assert load_image_dict

    with open(image_dict_pickle_path, "rb") as pickle_file:
        print("Loading from pickle")
        image_dict = pickle.load(pickle_file)
except Exception:
    with open(image_dict_pickle_path, "wb") as pickle_file:
        print("Downloading dataset")
        dataset_path: pathlib.Path = pathlib.Path(kagglehub.dataset_download(dataset_url[dataset_url.rfind("datasets") + len("datasets") + 1:]))
        print("Fetching from dataset")
        dict_get_data_archive(image_dict, dataset_path.joinpath(dataset_data_archive_path))
        pickle.dump(image_dict, pickle_file)

del open_archive, archive_type_switch, dict_get_data_archive
print("Done!")

Loading from pickle
Done!


In [None]:
def show_image_n_label(output: widgets.Output, image_dict: dict[str, list[bytes]], image_display_size: tuple[int, int], resampling: int) -> None:
    with output:
        random_label: str = secrets.choice(list(image_dict.keys()))
        image_bytes: bytes = secrets.choice(image_dict[random_label])

        clear_output(wait = True)
        display(Image.open(io.BytesIO(image_bytes)).resize(image_display_size, resampling))
        print(f"Label: {random_label}")

image_display_size: tuple[int, int] = (200, 200)
output: widgets.Output = widgets.Output()
next_button: widgets.Button = widgets.Button(description = "Next Image")

next_button.on_click(lambda button: show_image_n_label(output, image_dict, image_display_size, image_display_resampling))
next_button.click()
display(output)
display(next_button)

Output()

Button(description='Next Image', style=ButtonStyle())

In [21]:
def dataset_generator(image_dict: dict[str, list[bytes]], label_lookup: layers.StringLookup) -> Generator[tuple[bytes, tf.Tensor], None, None]:
    for key in image_dict.keys():
        for image_bytes in image_dict[key]:
            yield image_bytes, tf.squeeze(label_lookup(key))

def preprocess_image_bytes(image_bytes: bytes) -> tf.Tensor:
    image: tf.Tensor = tf.io.decode_image(image_bytes, channels = 3)

    image.set_shape([None, None, 3])
    image = tf.image.resize_with_pad(image, dataset_image_resize[0], dataset_image_resize[1], dataset_image_resampling)
    image = tf.image.convert_image_dtype(image, tf.float32)
    return image

image_dict_key_list: list[str] = list(image_dict.keys())
image_size: tuple[int, int] = Image.open(io.BytesIO(list(image_dict.values())[0][0])).size

output_signature: tuple[tf.TensorSpec] = (
    tf.TensorSpec(shape = (), dtype = tf.string),
    tf.TensorSpec(shape = (len(image_dict_key_list) + 1, ), dtype = tf.int32)
)

label_lookup: layers.StringLookup = layers.StringLookup(vocabulary = image_dict_key_list, output_mode = "one_hot")
dataset: tf.data.Dataset = tf.data.Dataset.from_generator(lambda: dataset_generator(image_dict, label_lookup), output_signature = output_signature)

dataset = dataset.map(lambda x, y: (preprocess_image_bytes(x), y), num_parallel_calls = tf.data.AUTOTUNE, deterministic = False)

In [22]:
def get_dataset_length(image_dict: dict[str, list[bytes]]) -> int:
    total_length: int = 0

    for value in image_dict.values():
        total_length += len(value)

    return total_length

dataset_count: int = get_dataset_length(image_dict)

dataset = dataset.shuffle(dataset_count if dataset_shuffle_buffer_size is None else dataset_shuffle_buffer_size)

train_elements: int = int(dataset_count * train_split)
train_dataset: tf.data.Dataset = dataset.take(train_elements)
val_dataset: tf.data.Dataset = dataset.skip(train_elements)

print(f"Counted {dataset_count} total elements\nTraining set: {train_elements}\nValidation set: {dataset_count - train_elements}")

train_dataset = train_dataset.batch(dataset_batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(dataset_batch_size).prefetch(tf.data.AUTOTUNE)

Counted 375974 total elements
Training set: 300779
Validation set: 75195



# Section 2: Model Building

## What is Image Classification without an Image Classifier?

In [24]:
def init_image_classifier(input_shape: tuple[int, ...], hidden_layers: list[layers.Layer], output_shape: int) -> keras.Sequential:
    model: keras.Sequential = keras.Sequential()

    model.add(layers.Input(input_shape))

    for layer in hidden_layers:
        model.add(layer)

    model.add(layers.Dense(output_shape, activation = "softmax"))
    return model

try:
    assert load_model

    model: keras.Sequential = keras.models.load_model(model_path)
except:
    model: keras.Sequential = init_image_classifier((image_size[0], image_size[1], 3), model_init_hidden_layers, len(image_dict_key_list) + 1)

    model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])
finally:
    early_stop_callback: keras.callbacks.EarlyStopping = keras.callbacks.EarlyStopping(
        patience = 3,
        restore_best_weights = True
    )

    checkpoint_callback: keras.callbacks.ModelCheckpoint = keras.callbacks.ModelCheckpoint(
        filepath = model_path,
        save_best_only = True
    )

    model.fit(train_dataset, validation_data = val_dataset,epochs = train_epochs, callbacks = [early_stop_callback, checkpoint_callback], verbose = 1)

Epoch 1/16


2025-06-16 18:48:54.659712: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4370 of 10000
2025-06-16 18:49:07.076356: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


   4699/Unknown [1m741s[0m 152ms/step - accuracy: 0.6130 - loss: 1.3612

2025-06-16 19:01:04.764505: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:549] Omitted potentially buggy algorithm eng14{k25=0} for conv %cudnn-conv-bias-activation.9 = (f32[43,64,43,43]{3,2,1,0}, u8[0]{0}) custom-call(f32[43,3,45,45]{3,2,1,0} %bitcast.7662, f32[64,3,3,3]{3,2,1,0} %bitcast.7669, f32[64]{0} %bitcast.8282), window={size=3x3}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", metadata={op_type="Conv2D" op_name="sequential_1_1/conv2d_1/convolution" source_file="/home/tebapunix/math-symbol-classifier/.venv/lib/python3.10/site-packages/tensorflow/python/framework/ops.py" source_line=1200}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false}
2025-06-16 19:01:04.795149: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:549] Om

   4700/Unknown [1m744s[0m 153ms/step - accuracy: 0.6130 - loss: 1.3612

2025-06-16 19:01:17.710174: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4617 of 10000
2025-06-16 19:01:27.710543: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 9125 of 10000
2025-06-16 19:01:29.687193: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.
2025-06-16 19:12:29.465948: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-06-16 19:12:29.517304: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:549] Omitted potentially buggy algorithm eng14{k25=0} for conv %cudnn-conv-bias-activation.9 = (f32[64,64,43,43]{3,2,1,0}, u8[0]{0}) custom-call(f32[64,3,45,45]{3,2,1,0} %bitcast.609, f32[64,3,3,3]{3,2,1,0} %bit

[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1573s[0m 329ms/step - accuracy: 0.6130 - loss: 1.3611 - val_accuracy: 0.0077 - val_loss: 10.7376
Epoch 2/16


2025-06-16 19:14:56.316317: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2025-06-16 19:14:56.316396: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3763693996949390649
2025-06-16 19:14:56.316452: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3372084804896467218
2025-06-16 19:15:06.426739: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4573 of 10000


[1m   1/4700[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m28:39:02[0m 22s/step - accuracy: 0.0000e+00 - loss: 3.8405

2025-06-16 19:15:18.208168: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - accuracy: 0.7953 - loss: 0.6630

2025-06-16 19:27:15.510855: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4633 of 10000
2025-06-16 19:27:25.512787: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 9238 of 10000
2025-06-16 19:27:27.186050: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1551s[0m 325ms/step - accuracy: 0.7953 - loss: 0.6631 - val_accuracy: 0.0099 - val_loss: 12.7220
Epoch 3/16


2025-06-16 19:40:47.284554: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3763693996949390649
2025-06-16 19:40:47.284622: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3372084804896467218
2025-06-16 19:40:57.304633: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4556 of 10000


[1m   1/4700[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m28:27:05[0m 22s/step - accuracy: 0.0000e+00 - loss: 3.8967

2025-06-16 19:41:08.944981: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step - accuracy: 0.8423 - loss: 0.5209

2025-06-16 19:53:09.734318: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4508 of 10000
2025-06-16 19:53:21.704048: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1584s[0m 332ms/step - accuracy: 0.8423 - loss: 0.5210 - val_accuracy: 0.0318 - val_loss: 16.6940
Epoch 4/16


2025-06-16 20:07:10.871766: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3763693996949390649
2025-06-16 20:07:10.871843: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3372084804896467218
2025-06-16 20:07:20.898855: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4604 of 10000
2025-06-16 20:07:30.899200: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 9161 of 10000


[1m   1/4700[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m28:42:49[0m 22s/step - accuracy: 0.0000e+00 - loss: 4.4214

2025-06-16 20:07:32.742507: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step - accuracy: 0.8536 - loss: 0.7393

2025-06-16 20:19:51.431843: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 4545 of 10000
2025-06-16 20:20:03.635671: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1623s[0m 341ms/step - accuracy: 0.8536 - loss: 0.7395 - val_accuracy: 0.0268 - val_loss: 15.1023


2025-06-16 20:34:14.015223: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3763693996949390649
2025-06-16 20:34:14.015313: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3372084804896467218
