# Primetals Python implementation of the Geometrically Inspired Kernel Machine (GIKM) on MNIST

In [1]:
import time
import numpy as np
import os
import scipy
import pandas as pd
from func import Classifier, predictionClassifier, combineMultipleClassifiers

## Load data for MNIST

In [None]:
# Load the per-digit MNIST arrays from the MATLAB file and build the training
# and test matrices, their label vectors, and the per-class result holders
# used by the training/evaluation cell.

# `scipy.io` is imported explicitly: a bare `import scipy` does not guarantee
# that the `io` submodule is reachable as `scipy.io` on every SciPy release.
import scipy.io

# Construct the file path for the .mat file.
# Assumes the current working directory is the Python equivalent of MATLAB's
# pwd and that its path contains 'GIKM_python', which is swapped for 'Datasets'.
print(os.getcwd())
dataset_path = os.path.join(os.getcwd().replace('GIKM_python', 'Datasets'), 'MNIST', 'mnist_all.mat')

# Load the .mat file
data = scipy.io.loadmat(dataset_path)

# One array per digit, in digit order 0..9.
train_sets = [data[f'train{digit}'] for digit in range(10)]
test_sets = [data[f'test{digit}'] for digit in range(10)]

# Keep the individual per-digit names available for any cell that uses them.
(train0, train1, train2, train3, train4,
 train5, train6, train7, train8, train9) = train_sets
(test0, test1, test2, test3, test4,
 test5, test6, test7, test8, test9) = test_sets


def _stack_and_squash(digit_sets):
    """Return the transposed digit arrays joined column-wise, scaled and squashed.

    Each array is transposed and the results are concatenated with
    ``np.hstack``; the values are converted to float64, divided by 255 and
    passed through ``np.tanh``.
    """
    stacked = np.hstack([digit_set.T for digit_set in digit_sets])
    return np.tanh(stacked.astype(np.float64) / 255.0)


def _one_based_labels(digit_sets):
    """Return one integer label per row of each array, in stacking order.

    Labels are 1-based: the array at position ``k`` (digit ``k``) gets the
    label ``k + 1``, so the labels run from 1 to 10.
    """
    return np.hstack([
        (digit + 1) * np.ones(digit_set.shape[0])
        for digit, digit_set in enumerate(digit_sets)
    ]).astype(int)


# Training data and labels
y_data_trn = _stack_and_squash(train_sets)

# Print the shapes for verification
print(f"shape of train0: {train0.shape}")
print("Shape of y_data_trn:", y_data_trn.shape)

labels_trn = _one_based_labels(train_sets)

# Test data and labels, built exactly like the training set
y_data_test = _stack_and_squash(test_sets)
labels_test = _one_based_labels(test_sets)

# Unique labels and number of labels
labels = np.unique(labels_trn)
Q = len(labels)

# Initialize lists to store results for each class
min_distance_arr = [None] * Q
labels_arr_arr = [None] * Q
max_modeling_error = 0


In [None]:
# Train one classifier per class label, score the whole test set with each of
# them, then merge the per-class results into a single prediction.
for class_pos, class_label in enumerate(labels):
    # Positions of the training samples that carry the current label
    member_idx = np.where(labels_trn == class_label)[0]
    class_samples = y_data_trn[:, member_idx]
    class_labels = labels_trn[member_idx]

    # Fit the classifier on this class's data only
    CLF = Classifier(class_samples, class_labels, subspace_dim=20, Nb=1000)

    # Evaluate the fitted classifier on the full test set and keep both outputs
    distances, predicted = predictionClassifier(y_data_test, CLF)
    min_distance_arr[class_pos] = distances
    labels_arr_arr[class_pos] = predicted

    # Remember the largest modeling error reported by any classifier
    class_error = CLF['max_modeling_error']
    if class_error > max_modeling_error:
        max_modeling_error = class_error

# Merge the per-class outputs; only the second return value is used
combined = combineMultipleClassifiers(min_distance_arr, labels_arr_arr)
hat_labels_test = combined[1]

# Fraction of test samples whose predicted label equals the true label
matches = hat_labels_test == labels_test
acc = np.mean(matches)

print(f"Overall maximum modeling error: {max_modeling_error}")
print(f"Accuracy: {acc}")

## Code Rundown
- For every **label** in labels:
  - find all **ind** (indexes) in the train set belonging to the class **label**
  - create a Classifier **CLF** from the data (X, y) belonging to class **label**, with subspace dimension 20 and **Nb** = 1000 (the subset size, which determines the number of clusters/subsets)
    - for every unique class label define a separate **autoencoder** with function **parallelAutoencoders**
      - Depending on the ratio between Number of samples **N** and subset size **Nb**
      - **if** more than one cluster
        - cluster the data with **Kmeans**
        - eliminate clusters containing fewer than two samples
        - Reducing the Number of Clusters **(While Loop)**
        - find clusters with fewer than 30 samples and, if any exist, repeat the cluster-reduction step
        - per cluster train an **Autoencoder** with function **parallel Autoencoder**
          - Reduce data dimensionality and project data onto new dimensions with **dimReduce** and return **y_data_subspace** and **PC** (Principal components)
          - if a dimension does not show significant spread in the data, reduce the dimensionality by one
          - compute the weights matrix with the inverse of the covariance matrix
          - calc **KxxMatrix** with x matrix, weights matrix and kerneltype (so far only gaussian)
            - define size of Kxx matrix with dimensions of x matrix
            - calculate distance between all samples and calculate squared distance with the weights matrix
            - apply gaussian filter with $ \exp(-(0.5/n)* \text{Kxx})$
          - calculate **kernel least squares** with **KxxMatrix** and **ydata**
            - regularize data and control overfit with lambda
            - run multiple regularization passes to improve accuracy
          - apply **AutoencoderFiltering** to project data to autoencoders
            - project data into autoencoder subspace
            - handle large number of samples in batches
            - for each batch, kernel based weight calculation
            - Calculate distance between new projected data and train data
  - define **PredictionClassifier** with **Classifier** and **y_data_test** input
    - for each class label calculate the distance for every data point with the specific autoencoder to the label
      - apply **AutoencoderFiltering** to calculate the distances
    - predicted label depends on the minimum distance to the class within the distance_matrix for every label