In [1]:
import numpy as np
import struct
from array import array
import os
from os.path  import join
import random as rn
import graphviz
import pydotplus
from IPython.display import Image
from io import StringIO
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
import time
import itertools
import matplotlib as mpl
import pandas as pd

from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection

In [2]:
"""
Got this code from https://www.kaggle.com/code/hojjatk/read-mnist-dataset/notebook for 
instructions on how to load data from the MNIST dataset
"""
class MnistDataloader(object):
    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath
    
    def read_images_labels(self, images_filepath, labels_filepath):        
        labels = []
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())        
        
        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())        
        images = []
        for i in range(size):
            images.append([0] * rows * cols)
        for i in range(size):
            img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
#             img = img.reshape(28, 28)
            images[i][:] = img            
        
        return np.array(images), np.array(labels)
            
    def load_data(self):
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)  

In [3]:
#
# Verify Reading Dataset via MnistDataloader class
#
%matplotlib inline
import random
import matplotlib.pyplot as plt

#
# Set file paths based on added MNIST Datasets
#
input_path = 'data/mnist_datafolder'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')

#
# Helper function to show a list of images with their relating titles
#
def show_images(images, title_texts):
    cols = 5
    rows = int(len(images)/cols) + 1
    plt.figure(figsize=(30,20))
    index = 1    
    for x in zip(images, title_texts):        
        image = x[0]        
        title_text = x[1]
        plt.subplot(rows, cols, index)        
        plt.imshow(image, cmap=plt.cm.gray)
        if (title_text != ''):
            plt.title(title_text, fontsize = 15);        
        index += 1

#
# Load MINST dataset
#
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

In [4]:
"""
Setting up seed values for reproducability
"""
seed = 1234
np.random.seed(seed)
rn.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

In [5]:
"""
Getting the size values for the training and testing dataset
"""
size_train_samples = np.shape(x_train)[0]
size_test_samples = np.shape(x_test)[0]
print(np.shape(x_train)[1])

784


In [6]:
"""
Taking a subset of the MNIST dataset since the 60000 it is doing is taking very long
"""

subset_size = 10000

random_indexes = np.random.choice(size_train_samples, subset_size, replace = False)
x_train_subset = np.take(x_train, random_indexes, axis=0)
y_train_subset = np.take(y_train, random_indexes)

In [9]:
"""
Running Sparse Randomized Projections on the MNIST Dataset to achieve Dimension Reductionality.
This method is to find the best number of components based on the Reconstruction Error score.
"""

n_components_range = list(range(0, 800, 50))
n_components_range[0] = 2
n_components_range[-1] = 784

# Hyperparameters for RP
density = 'auto'
eps = 0.1
dense_output = False

# Scores for the reconstruction error
reconstruction_error_scores = []

for n_components in n_components_range:
    rp = SparseRandomProjection(
        n_components = n_components,
        density = density,
        eps = eps,
        dense_output = dense_output,
        random_state = seed
    )
    
    x_transformed = rp.fit_transform(x_train_subset)
    x_projected = rp.inverse_transform(x_transformed)
    
    reconstruction_error_scores.append(np.sum((x_train_subset - x_projected) ** 2, axis=1).mean())


AttributeError: 'SparseRandomProjection' object has no attribute 'inverse_transform'