
# Contents

0. Setup
1. Acquisition of Data
2. Model Building


# Section 0: Setup

## Prerequisites are essential by definition

In [1]:
%%html

<style>
/* Make elements with the ipywidget background class transparent; !important overrides any competing rule. */
.cell-output-ipywidget-background{
    background-color: transparent !important;
}

/* Transparent background and white text for output areas.
   NOTE(review): presumably meant for a dark theme - white text would be hard to read on a light one; confirm. */
.jp-OutputArea-output{
    background-color: transparent;
    color: white;
}
</style>

In [None]:
import pathlib
import mlcroissant as mlc
import contextlib
import zipfile, rarfile, tarfile
import re
import pickle
import io
from PIL import Image
from typing import Any, Callable, Generator
from IPython.display import display, clear_output
import secrets
import ipywidgets as widgets
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import numpy.typing as npt

2025-06-10 13:15:15.415598: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-10 13:15:15.875065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749536116.070968    3333 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749536116.137239    3333 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749536116.670808    3333 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:

# Croissant JSON-LD description of the dataset; read when the metadata is loaded.
jsonld_path: pathlib.Path = pathlib.Path("./croissants/handwrittenmathsymbols-metadata.json")
# Kaggle page of the dataset; the part after "datasets/" is what gets passed to kagglehub.
dataset_url: str = "https://www.kaggle.com/datasets/xainano/handwrittenmathsymbols"
# When True, the pickled image cache is tried before downloading anything.
load_image_dict: bool = True
# Archive holding the images, relative to the downloaded dataset directory.
dataset_data_archive_path: pathlib.Path = pathlib.Path("./data.rar")
# Location of the pickled label -> image-bytes cache.
image_dict_pickle_path: pathlib.Path = pathlib.Path("./pickles/image_dict.pkl")
# Shuffle buffer size; None means "use the total number of images".
dataset_shuffle_buffer_size: int | None = 10000
# Fraction of the images used for training; the remainder is used for validation.
train_split: float = 0.8
# Number of samples per batch for both the training and the validation set.
dataset_batch_size: int = 32
# When True, a saved model at model_path is tried before a new one is built.
load_model: bool = True
# Saved model location; also used as the checkpoint path during training.
model_path: pathlib.Path = pathlib.Path("./keras models/model.keras")

# Layers placed between the input layer and the softmax output layer of a newly built model.
model_init_hidden_layers: list[layers.Layer] = [
    layers.Conv2D(64, 3, activation = "relu"),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation = "relu"),
    layers.MaxPooling2D(),
    layers.Conv2D(48, 3, activation = "relu"),
    layers.SpatialDropout2D(0.2),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(48, activation = "relu"),
    layers.Dense(32, activation = "relu"),
    layers.Dropout(0.4),
    layers.Dense(16, activation = "relu")
]

# Number of epochs passed to model.fit.
train_epochs: int = 3

2025-06-10 13:15:27.224164: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)



# Section 1: Acquisition of Data

## The very foundation of analysis

In [None]:
# Parse the Croissant description once and keep only its metadata record.
metadata: mlc.Metadata = mlc.Dataset(jsonld = jsonld_path).metadata

# Banner: highlighted dataset URL first, then the dataset name and publication date.
print(
    f"\n\n\n\x1b[2J\x1b[93;1mDataset at {dataset_url}\x1b[0m\n\n"
    f"{metadata.name}\nPublished: {metadata.date_published}"
)

  -  [Metadata(Handwritten math symbols dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.





[2J[93;1mDataset at https://www.kaggle.com/datasets/xainano/handwrittenmathsymbols[0m

Handwritten math symbols dataset
Published: 2017-01-15 16:49:28.723000


In [None]:
# Maps each label to the raw encoded bytes of every image carrying that label.
image_dict: dict[str, list[bytes]] = dict()

@contextlib.contextmanager
def open_archive(path: pathlib.Path) -> Generator[zipfile.ZipFile | rarfile.RarFile | tarfile.TarFile, None, None]:
    if not path.is_file():
        raise FileNotFoundError(f"File not found")

    file: zipfile.ZipFile | rarfile.RarFile | tarfile.TarFile | None = None

    match path.suffix:
        case ".zip":
            file =  zipfile.ZipFile(path)
        case ".rar":
            file =  rarfile.RarFile(path)
        case suffix if re.search(suffix, r"\.tar(\.[^ \n]+)?"):
            file = tarfile.open(path)
        case _:
            raise ValueError("File type not supported")
    try:
        yield file
    finally:
        if file is not None:
            file.close()

def archive_type_switch(archive_file: zipfile.ZipFile | rarfile.RarFile | tarfile.TarFile, input: tuple[Any, ...], zip_or_rar_callback: Callable[..., Any], tar_callback: Callable[..., Any]) -> Any:
    """Call the callback that matches the kind of archive and return its result.

    Args:
        archive_file: The opened archive whose type selects the callback.
        input: Positional arguments that are unpacked into the chosen callback.
        zip_or_rar_callback: Used for zip and rar archives.
        tar_callback: Used for tar archives.

    Returns:
        Whatever the selected callback returns.

    Raises:
        TypeError: ``archive_file`` is none of the supported archive types.
    """
    # isinstance also accepts subclasses, which an exact type comparison would miss.
    if isinstance(archive_file, (zipfile.ZipFile, rarfile.RarFile)):
        return zip_or_rar_callback(*input)
    if isinstance(archive_file, tarfile.TarFile):
        return tar_callback(*input)

    # Fail loudly instead of returning None, which would only surface later as an unrelated error.
    raise TypeError(f"Unsupported archive type: {type(archive_file).__name__}")

def dict_get_data_archive(image_dict: dict[str, list[bytes]], path: pathlib.Path) -> None:
    """Read every file of an archive into ``image_dict``, labelled by its parent folder.

    The label of an entry is the name of the folder that directly contains it,
    e.g. ``extracted_images/N/exp1.jpg`` gets the label ``N``. Entries without a
    parent folder are skipped because no label can be derived for them.

    Args:
        image_dict: Mapping that is extended in place with label -> list of file contents.
        path: Location of the zip, rar or tar archive to read.
    """
    with open_archive(path) as archive_file:
        # TarFile lists its members with getmembers(); zip and rar use infolist().
        entries = archive_type_switch(archive_file, (archive_file, ), lambda x: x.infolist(), lambda x: x.getmembers())

        for entry in entries:
            # Tar archives can also contain links and devices, so only regular files are kept there.
            if archive_type_switch(archive_file, (entry, ), lambda x: x.is_dir(), lambda x: not x.isfile()):
                continue

            # Zip/rar entries expose their path as .filename, tar entries as .name.
            entry_name: str = archive_type_switch(archive_file, (entry, ), lambda x: x.filename, lambda x: x.name)
            entry_label: str = pathlib.PurePosixPath(entry_name).parent.name

            if not entry_label:
                print(f"Skipped {entry_name}: no parent folder to use as label")
                continue

            # TarFile.open is a classmethod that opens archives, so members are read with extractfile().
            with archive_type_switch(archive_file, (entry, ), lambda x: archive_file.open(x, "r"), lambda x: archive_file.extractfile(x)) as image_file:
                image_dict.setdefault(entry_label, []).append(image_file.read())

            print(f"Added {entry_name} with label {entry_label}")

# Step 1: reuse the pickled cache when it is enabled and present.
if load_image_dict and image_dict_pickle_path.is_file():
    try:
        with open(image_dict_pickle_path, "rb") as pickle_file:
            print("Loading from pickle")
            # NOTE(security): pickle.load can execute arbitrary code; only load caches written by this notebook.
            image_dict = pickle.load(pickle_file)
    except Exception as cache_error:
        # A truncated or incompatible cache is rebuilt below instead of stopping the notebook.
        print(f"Could not load {image_dict_pickle_path}: {cache_error!r}")
        image_dict = {}

# Step 2: otherwise download the dataset and read the images out of its archive.
if not image_dict:
    # Imported lazily so that kagglehub is only required when a download is actually needed.
    import kagglehub

    print("Downloading dataset")
    # kagglehub expects the "<owner>/<dataset>" handle, i.e. everything after "/datasets/" in the URL.
    dataset_handle: str = dataset_url.partition("/datasets/")[2]
    dataset_path: pathlib.Path = pathlib.Path(kagglehub.dataset_download(dataset_handle))
    print("Fetching from dataset")
    dict_get_data_archive(image_dict, dataset_path.joinpath(dataset_data_archive_path))

    # The cache file is written only after extraction succeeded, so an interrupted run
    # cannot leave an empty pickle behind.
    image_dict_pickle_path.parent.mkdir(parents = True, exist_ok = True)
    with open(image_dict_pickle_path, "wb") as pickle_file:
        pickle.dump(image_dict, pickle_file)

del open_archive, archive_type_switch, dict_get_data_archive
print("Done!")

Added extracted_images/N/exp86752.jpg with label N
Added extracted_images/N/exp86757.jpg with label N
Added extracted_images/N/exp86758.jpg with label N
Added extracted_images/N/exp86760.jpg with label N
Added extracted_images/N/exp86762.jpg with label N
Added extracted_images/N/exp86768.jpg with label N
Added extracted_images/N/exp86774.jpg with label N
Added extracted_images/N/exp86778.jpg with label N
Added extracted_images/N/exp86782.jpg with label N
Added extracted_images/N/exp86784.jpg with label N
Added extracted_images/N/exp86786.jpg with label N
Added extracted_images/N/exp8679.jpg with label N
Added extracted_images/N/exp86792.jpg with label N
Added extracted_images/N/exp86793.jpg with label N
Added extracted_images/N/exp86796.jpg with label N
Added extracted_images/N/exp86801.jpg with label N
Added extracted_images/N/exp86803.jpg with label N
Added extracted_images/N/exp86810.jpg with label N
Added extracted_images/N/exp86890.jpg with label N
Added extracted_images/N/exp8689

KeyboardInterrupt: 

In [None]:
def show_image_n_label(output: widgets.Output, image_dict: dict[str, list[bytes]], image_display_size: tuple[int, int], resample: int) -> None:
    """Show one randomly chosen image and its label inside ``output``.

    A label is picked at random first, then one of the images stored under it.

    Args:
        output: Widget that receives the image and the caption; its previous content is replaced.
        image_dict: Mapping of label -> list of encoded image bytes.
        image_display_size: Size the image is resized to before it is displayed.
        resample: Resampling filter handed to ``Image.resize``.
    """
    with output:
        # Labels without images cannot be sampled from, so they are left out.
        candidate_labels: list[str] = [label for label, images in image_dict.items() if images]

        clear_output(wait = True)

        if not candidate_labels:
            print("No images available")
            return

        random_label: str = secrets.choice(candidate_labels)
        image_bytes: bytes = secrets.choice(image_dict[random_label])

        # Honour the caller's resampling filter instead of a hard-coded one.
        display(Image.open(io.BytesIO(image_bytes)).resize(image_display_size, resample))
        print(f"What you just saw is \x1b[1m{random_label}\x1b[0m")

# Size at which the sample images are shown.
image_display_size: tuple[int, int] = (200, 200)
output: widgets.Output = widgets.Output()
next_button: widgets.Button = widgets.Button(description = "Next Image")

# The function is bound as a default argument so the handler keeps its own reference.
# A plain global lookup would fail on every click once the name is deleted below.
next_button.on_click(lambda button, show = show_image_n_label: show(output, image_dict, image_display_size, Image.Resampling.NEAREST))
del show_image_n_label
print("\x1b[92mNow cryptographically secure!\x1b[0m")
# Simulate one click so the output already holds an image when it is displayed.
next_button.click()
display(output)
display(next_button)

[92mNow cryptographically secure![0m


Output()

Button(description='Next Image', style=ButtonStyle())

In [None]:
def bytes_to_array_generator(image_dict: dict[str, list[bytes]], label_lookup: layers.StringLookup, shuffle_seed: int | None = 0) -> Generator[tuple[tf.Tensor, tf.Tensor], None, None]:
    """Yield every image as a decoded float32 tensor together with its encoded label.

    Samples are visited in a seeded random order by default. Visiting them class
    by class would leave a bounded shuffle buffer unable to mix the classes, so a
    later take/skip split would put only the last classes into validation.

    Args:
        image_dict: Mapping of label -> list of encoded image bytes.
        label_lookup: Lookup layer that turns a label string into its encoded vector.
        shuffle_seed: Seed of the visiting order. The same seed gives the same
            order on every pass; ``None`` keeps the stored class-by-class order.

    Yields:
        ``(image, label)`` where ``image`` is a 3-channel float32 tensor and
        ``label`` is the int32 output of ``label_lookup`` for the image's class.
    """
    # The encoding of a label is the same for all of its images, so it is computed once per label.
    encoded_labels: dict[str, tf.Tensor] = {
        key: tf.cast(tf.squeeze(label_lookup(key)), tf.int32) for key in image_dict
    }

    # Flat (label, position) index over all images; cheap compared to the image bytes themselves.
    sample_index: list[tuple[str, int]] = [
        (key, position) for key, images in image_dict.items() for position in range(len(images))
    ]

    if shuffle_seed is None:
        visit_order = range(len(sample_index))
    else:
        visit_order = np.random.default_rng(shuffle_seed).permutation(len(sample_index))

    for slot in visit_order:
        key, position = sample_index[slot]
        # expand_animations = False keeps the result 3-D even for animated formats.
        image = tf.io.decode_image(image_dict[key][position], channels = 3, expand_animations = False)

        # decode_image returns uint8; cast explicitly to the float32 the dataset signature declares.
        yield tf.cast(image, tf.float32), encoded_labels[key]

image_dict_key_list: list[str] = list(image_dict.keys())

if not image_dict_key_list or not image_dict[image_dict_key_list[0]]:
    raise ValueError("image_dict holds no images; run the data acquisition cell first")

# PIL reports (width, height) while decoded image tensors are (height, width, channels),
# so the size is stored as (height, width).
# NOTE(review): assumes every image has the size of the first one - confirm for this dataset.
with Image.open(io.BytesIO(image_dict[image_dict_key_list[0]][0])) as first_image:
    image_size: tuple[int, int] = (first_image.height, first_image.width)

# One spec per element yielded by the generator: the image, then the encoded label.
# The label has one slot per known label plus one extra slot.
output_signature: tuple[tf.TensorSpec, tf.TensorSpec] = (
    tf.TensorSpec(shape = (image_size[0], image_size[1], 3), dtype = tf.dtypes.float32),
    tf.TensorSpec(shape = (len(image_dict_key_list) + 1, ), dtype = tf.dtypes.int32)
)

label_lookup: layers.StringLookup = layers.StringLookup(vocabulary = image_dict_key_list, output_mode = "one_hot")

# from_generator converts everything in `args` to tensors, which is impossible for a dict
# or a Keras layer; a closure hands both objects to the generator unchanged.
dataset: tf.data.Dataset = tf.data.Dataset.from_generator(
    lambda: bytes_to_array_generator(image_dict, label_lookup),
    output_signature = output_signature
)

In [None]:
def get_dataset_length(image_dict: dict[str, list[bytes]]) -> int:
    """Return the number of images stored across all labels of ``image_dict``."""
    return sum(len(images) for images in image_dict.values())

dataset_count: int = get_dataset_length(image_dict)

if not 0.0 < train_split < 1.0:
    raise ValueError(f"train_split must be between 0 and 1, got {train_split}")

# None in the configuration means "buffer the whole dataset".
shuffle_buffer_size: int = dataset_count if dataset_shuffle_buffer_size is None else dataset_shuffle_buffer_size

# take() and skip() each iterate the shuffled dataset on their own. With the default
# reshuffling every pass has a different order, so training samples would end up in the
# validation set; a fixed order keeps the two parts disjoint.
# A new name is used so that re-running this cell does not stack another shuffle on `dataset`.
shuffled_dataset: tf.data.Dataset = dataset.shuffle(shuffle_buffer_size, reshuffle_each_iteration = False)

train_elements: int = int(dataset_count * train_split)
train_dataset: tf.data.Dataset = shuffled_dataset.take(train_elements)
val_dataset: tf.data.Dataset = shuffled_dataset.skip(train_elements)

print(f"Counted {dataset_count} total elements\nTraining set: {train_elements}\nValidation set: {dataset_count - train_elements}")

train_dataset = train_dataset.batch(dataset_batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(dataset_batch_size).prefetch(tf.data.AUTOTUNE)


# Section 2: Model Building

## What is Image Classification without an Image Classifier?

In [None]:
def init_image_classifier(input_shape: tuple[int, ...], hidden_layers: list[layers.Layer], output_shape: int) -> keras.Sequential:
    """Assemble a sequential classifier: input layer, the given hidden layers, softmax output.

    Args:
        input_shape: Shape of one input sample.
        hidden_layers: Layers placed, in order, between the input and the output layer.
        output_shape: Number of units of the final softmax layer.

    Returns:
        The assembled, not yet compiled model.
    """
    stack: list[layers.Layer] = [layers.Input(input_shape)]
    stack.extend(hidden_layers)
    stack.append(layers.Dense(output_shape, activation = "softmax"))

    return keras.Sequential(stack)

# Reuse a previously saved model when allowed; otherwise build and compile a new one.
model: keras.Sequential | None = None

if load_model and model_path.exists():
    try:
        model = keras.models.load_model(model_path)
    except Exception as load_error:
        # A corrupt or incompatible file is reported and replaced by a new model below.
        print(f"Could not load model from {model_path}: {load_error!r}")

if model is None:
    model = init_image_classifier((image_size[0], image_size[1], 3), model_init_hidden_layers, len(image_dict_key_list) + 1)

    model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])

# Training happens only after a model exists. Running it from a `finally` clause would also
# run it after a failed build and hide the real error behind a NameError.
# NOTE(review): patience equals train_epochs (3), so early stopping cannot trigger with the
# current configuration - confirm whether that is intended.
early_stop_callback: keras.callbacks.EarlyStopping = keras.callbacks.EarlyStopping(
    patience = 3,
    restore_best_weights = True
)

# Writes the model to model_path, keeping only the best one seen so far.
checkpoint_callback: keras.callbacks.ModelCheckpoint = keras.callbacks.ModelCheckpoint(
    filepath = model_path,
    save_best_only = True
)

model.fit(train_dataset, validation_data = val_dataset, epochs = train_epochs, callbacks = [early_stop_callback, checkpoint_callback], verbose = 1)