In [None]:
# MNIST - https://www.kaggle.com/datasets/hojjatk/mnist-dataset
# Root folder that holds the extracted dataset folders. Set the MNIST_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# local path below is used, so existing runs behave exactly as before.
import os

data_dir = os.environ.get('MNIST_DATA_DIR', '/Users/rytis/Desktop/EU_parama/data/mnist/')

In [None]:
import numpy as np
from array import array
import struct
from os.path  import join
import random
import matplotlib.pyplot as plt

In [None]:
class MnistDataloader(object):
    """Reader for four MNIST files: train/test images and train/test labels.

    Construction only stores the file paths; the files are opened when
    ``load_data`` or ``read_images_labels`` is called.
    """

    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        """Remember where the four dataset files live; no I/O happens here."""
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one image file together with its label file.

        Each file starts with a header of big-endian unsigned 32-bit
        integers -- ``magic, count`` for labels and ``magic, count, rows,
        cols`` for images -- followed by one unsigned byte per label/pixel.

        Args:
            images_filepath: Path of the image file (magic number 2051).
            labels_filepath: Path of the label file (magic number 2049).

        Returns:
            ``(images, labels)``. ``images`` is a list with one entry per
            image; each entry is a list of ``rows`` NumPy ``uint8`` arrays of
            length ``cols``. ``labels`` is an ``array('B')`` of label bytes.

        Raises:
            ValueError: If a magic number is wrong, if a payload size does
                not match its header, or if the image and label counts differ.
        """
        with open(labels_filepath, 'rb') as file:
            magic, label_count = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())
        if len(labels) != label_count:
            raise ValueError('Label file declares {} labels but contains {}'.format(
                label_count, len(labels)))

        with open(images_filepath, 'rb') as file:
            magic, image_count, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())
        expected_pixels = image_count * rows * cols
        if len(image_data) != expected_pixels:
            raise ValueError('Image file declares {} pixel bytes but contains {}'.format(
                expected_pixels, len(image_data)))
        if image_count != label_count:
            raise ValueError('Image count {} does not match label count {}'.format(
                image_count, label_count))

        # Shape comes from the header rather than a hard-coded 28x28, so files
        # with other image dimensions load correctly.
        pixels = np.array(image_data, dtype=np.uint8).reshape(image_count, rows, cols)
        # Callers receive each image as a list of row arrays; iterating a 2-D
        # array yields exactly those rows.
        images = [list(image) for image in pixels]

        return images, labels

    def load_data(self):
        """Load both splits.

        Returns:
            ``((x_train, y_train), (x_test, y_test))`` where every pair is the
            ``(images, labels)`` result of ``read_images_labels``.
        """
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

In [None]:
input_path = data_dir

# Every dataset file sits inside a directory that carries the same name as
# the file, so each relative path repeats the name twice.
(
    training_images_filepath,
    training_labels_filepath,
    test_images_filepath,
    test_labels_filepath,
) = (
    join(input_path, relative_path)
    for relative_path in (
        'train-images-idx3-ubyte/train-images-idx3-ubyte',
        'train-labels-idx1-ubyte/train-labels-idx1-ubyte',
        't10k-images-idx3-ubyte/t10k-images-idx3-ubyte',
        't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte',
    )
)

In [None]:
def show_images(images, title_texts, cols=5):
    """Draw images in a grid of subplots on one new figure.

    Args:
        images: Sized sequence of images, each something ``plt.imshow``
            accepts.
        title_texts: Titles paired positionally with ``images``; an empty
            string leaves that subplot untitled. Pairing stops at the
            shorter of the two sequences.
        cols: Number of grid columns. Defaults to 5.

    Raises:
        ValueError: If ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError('cols must be at least 1, got {}'.format(cols))
    # Ceiling division gives exactly enough rows. The previous
    # int(len / cols) + 1 added an empty row whenever the image count was a
    # multiple of cols.
    rows = max(1, -(-len(images) // cols))
    plt.figure(figsize=(30,20))
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap='viridis')
        if title_text != '':
            plt.title(title_text, fontsize = 15)

In [None]:
# Build the loader from the four file paths, then read both splits.
mnist_dataloader = MnistDataloader(
    training_images_filepath=training_images_filepath,
    training_labels_filepath=training_labels_filepath,
    test_images_filepath=test_images_filepath,
    test_labels_filepath=test_labels_filepath,
)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

In [None]:
# Preview five random samples from each split, titled with index and label.
# random.randrange(n) draws from 0..n-1, so every valid index (including 0)
# can be chosen and the draw can never land one past the end of the list,
# which the inclusive random.randint(1, n) could.
images_show = []
titles_show = []
for i in range(0, 5):
    r = random.randrange(len(x_train))
    images_show.append(x_train[r])
    titles_show.append('training image [' + str(r) + '] = ' + str(y_train[r]))

for i in range(0, 5):
    r = random.randrange(len(x_test))
    images_show.append(x_test[r])
    titles_show.append('test image [' + str(r) + '] = ' + str(y_test[r]))

show_images(images_show, titles_show)

In [None]:
def plot_distribution(subset_name, class_counts, class_labels):
    """Render a bar chart of per-class counts on a new figure and show it.

    Args:
        subset_name: Text placed at the start of the chart title.
        class_counts: Bar heights, paired positionally with ``class_labels``.
        class_labels: X positions of the bars; also used as the x ticks.
    """
    plt.figure(figsize=(10, 6))

    # Dashed, half-transparent grid lines along both axes.
    for axis_name in ('x', 'y'):
        plt.grid(axis=axis_name, linestyle='--', alpha=0.5)

    drawn_bars = plt.bar(class_labels, class_counts)

    # Write each count just above its bar, centred horizontally on the bar.
    for rect, height in zip(drawn_bars, class_counts):
        centre_x = rect.get_x() + rect.get_width() / 2
        plt.text(centre_x, height, str(height), ha='center', va='bottom')

    plt.xticks(class_labels)
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.title(f"{subset_name} class distribution")
    plt.show()

In [None]:
# Class distribution for train subset
# np.unique(..., return_counts=True) returns the labels and their counts as
# aligned arrays. np.bincount also emits a zero entry for every absent value
# below the maximum label, which np.unique omits, so pairing the two could
# produce arrays of different lengths.
train_class_labels, train_class_counts = np.unique(y_train, return_counts=True)
plot_distribution('Train', train_class_counts, train_class_labels)

In [None]:
# Class distribution for test subset
# np.unique(..., return_counts=True) returns the labels and their counts as
# aligned arrays. np.bincount also emits a zero entry for every absent value
# below the maximum label, which np.unique omits, so pairing the two could
# produce arrays of different lengths.
test_class_labels, test_class_counts = np.unique(y_test, return_counts=True)
plot_distribution('Test', test_class_counts, test_class_labels)

In [None]:
# Feature analysis (pixel intensity distribution for the whole dataset)
def draw_pixel_distribution(subset_name, data, grouping=30):
    """Show a histogram of ``data`` over the value range 0 to 256.

    Args:
        subset_name: Text placed at the start of the chart title.
        data: Values to bin; passed to ``plt.hist`` unchanged.
        grouping: Number of histogram bins (also echoed in the title).
    """
    plt.figure(figsize=(10, 6))
    plt.hist(data, bins=grouping, range=(0, 256))
    plt.xlabel("Pixel Intensity")
    plt.ylabel("Frequency")
    heading = f"{subset_name} pixel intensity distribution [bins={grouping}]"
    plt.title(heading)
    # Dashed, half-transparent grid lines along both axes.
    for axis_name in ('x', 'y'):
        plt.grid(axis=axis_name, linestyle='--', alpha=0.5)
    plt.show()

In [None]:
# Convert every image from a nested list into a single NumPy array.
x_train = list(map(np.array, x_train))
x_test = list(map(np.array, x_test))

In [None]:
# draw pixel distribution for train
# axis=None flattens every image before joining, giving one 1-D pixel array.
draw_pixel_distribution('Train', np.concatenate(x_train, axis=None))

In [None]:
# draw pixel distribution for test
# axis=None flattens every image before joining, giving one 1-D pixel array.
draw_pixel_distribution('Test', np.concatenate(x_test, axis=None))