In [1]:
import numpy as np 
import cv2
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from minisom import MiniSom
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [2]:
# Define the path to the root folder of your dataset
root_folder = "MNIST Dataset/"

# Define lists to store the image data and labels
data = []
labels = []

# Loop over the folders in the root folder
for i in range(10):
    folder_path = root_folder + str(i) + "/"

    # Get the label corresponding to the folder name
    label = i

    # Loop over the images in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg'):
            image_path = os.path.join(folder_path, filename)

            # Read the image using OpenCV and convert it to grayscale
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            # Add the image and label to the dataset lists
            data.append(image)
            labels.append(label)

# Convert the data and labels lists to numpy arrays
data = np.array(data)
labels = np.array(labels)

# Split the data into training and testing sets, with 90% of each class for training
train_data = []
train_labels = []
test_data = []
test_labels = []

for i in range(10):
    class_data = data[labels == i]
    class_labels = labels[labels == i]

    train_class_data, test_class_data, train_class_labels, test_class_labels = train_test_split(class_data, class_labels, test_size=0.1)

    train_data.append(train_class_data)
    train_labels.append(train_class_labels)
    test_data.append(test_class_data)
    test_labels.append(test_class_labels)

# Concatenate the training and testing data and labels for each class
train_data = np.concatenate(train_data)
train_labels = np.concatenate(train_labels)
test_data = np.concatenate(test_data)
test_labels = np.concatenate(test_labels)

# Create a pandas dataframe to store the data and labels
train_df = pd.DataFrame(train_data.reshape(train_data.shape[0], -1))
train_df['label'] = train_labels

test_df = pd.DataFrame(test_data.reshape(test_data.shape[0], -1))
test_df['label'] = test_labels

#shuffle data frame:
train_df = train_df.sample(frac = 1)
test_df = test_df.sample(frac = 1)


In [3]:
# prepare Train and Test  Data and labels
TrainLabels = train_df['label']
TrainData = train_df.drop('label',axis=1)
TestLabels = test_df['label']
TestData = test_df.drop('label',axis=1)
TrainData = np.array(TrainData)
TrainLabels = np.array(TrainLabels)
TestLabels = np.array(TestLabels)
TestData = np.array(TestData)

In [4]:
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
1923,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
293,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
208,0,0,0,0,0,0,0,0,0,16,...,3,2,0,0,0,0,0,0,0,1
1129,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,5
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,0,0,0,0,0,0,0,0,0,6,...,0,0,0,0,0,0,0,0,0,6
1120,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,5
1791,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,8
823,0,0,0,0,0,0,0,0,0,13,...,0,0,0,0,0,0,0,0,0,4


In [5]:
#true Winner takes all approach with only 10 clusters 
# Load the training and test data
training_data = TrainData
test_data = TestData
training_labels = TrainLabels
test_labels = TestLabels

# Flatten the training and test data and normalize it:
training_data = training_data.reshape(training_data.shape[0], -1) / 255.
test_data = test_data.reshape(test_data.shape[0], -1) / 255.

# Define the parameters for the SOM:
input_len = 784  # number of features
classes = 10     # number of classes
som_size = 10    # size of the SOM
sigma = 1        # neighborhood radius
learning_rate = 0.2

# Implement the winner-takes-all approach:
# Create a SOM with one neuron for each class
som = MiniSom(som_size, som_size, input_len, sigma=sigma, learning_rate=learning_rate, 
              neighborhood_function='gaussian', random_seed=0)
som.pca_weights_init(training_data)
som.train_batch(training_data, 300000, verbose=True)
# Determine the number of images of each class that are placed in each cluster
class_counts = np.zeros((som_size, som_size, classes))
for index, label in np.ndenumerate(training_labels):
    i, j = som.winner(training_data[index])
    class_counts[i, j, label] += 1

# Determine the label for each cluster
class_labels = np.zeros((som_size, som_size))
for i in range(som_size):
    for j in range(som_size):
        class_labels[i, j] = np.argmax(class_counts[i, j])
# Determine the final class labels
class_labels_final = np.zeros((classes,))
for i in range(som_size):
    for j in range(som_size):
        class_labels_final[int(class_labels[i, j])] = np.argmax(class_counts[i, j])
        
# Reshape the class_labels_final array to (10,1) or (1,10)
class_labels_final = class_labels_final.reshape((1, 10))  # or (10, 1)

# Compute the train accuracy
train_labels_pred = np.zeros(len(training_labels))
for i in range(len(training_data)):
    closest_neuron_dist, closest_neuron = som.winner(training_data[i])
    train_labels_pred[i] = class_labels_final[0, closest_neuron]
train_accuracy = accuracy_score(training_labels, train_labels_pred)
print('Train accuracy:', train_accuracy*100 , ' %')


# Compute the test accuracy
test_labels_pred = np.zeros(len(test_labels))
for i in range(len(test_data)):
    closest_neuron_dist, closest_neuron = som.winner(test_data[i])
    test_labels_pred[i] = class_labels_final[0, closest_neuron]
test_accuracy = accuracy_score(test_labels, test_labels_pred)
print('Test accuracy:', test_accuracy*100 , ' %')

# Compute the Euclidean distances between each training sample and each neuron in the SOM
weights = som.get_weights()
distances = np.zeros((len(training_data), som_size * som_size))
for i in range(len(training_data)):
    for j in range(som_size * som_size):
        distances[i, j] = np.linalg.norm(training_data[i] - weights[j // som_size, j % som_size])

# Assign each training sample to the neuron it is closest to
assignments = np.argmin(distances, axis=1)

# Compute the DBI for the SOM
dbi_wta = davies_bouldin_score(training_data, assignments)
print('DBI for Winner-Takes-All approach:', dbi_wta)

print("Cluster Labels:", class_labels_final)




 [ 300000 / 300000 ] 100% - 0:00:00 left 
 quantization error: 5.145042647387838
Train accuracy: 6.194444444444445  %
Test accuracy: 7.199999999999999  %
DBI for Winner-Takes-All approach: 2.7884678153971936
Cluster Labels: [[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]]


In [6]:
#true winner takes all approach with (10,10). cluserts 

# Load the training and test data
training_data = TrainData
test_data = TestData
training_labels = TrainLabels
test_labels = TestLabels

# Flatten the training and test data and normalize it:
training_data = training_data.reshape(training_data.shape[0], -1) / 255.
test_data = test_data.reshape(test_data.shape[0], -1) / 255.

# Define the parameters for the SOM:
input_len = 784  # number of features
classes = 10     # number of classes
som_size = 10    # size of the SOM
sigma = 1      # neighborhood radius
learning_rate = 0.2

# Implement the true Winner-Takes-All approach:
# Create a SOM with one neuron for each class
som = MiniSom(som_size, som_size, input_len, sigma=sigma, learning_rate=learning_rate, 
              neighborhood_function='gaussian', random_seed=0)
som.pca_weights_init(training_data)
som.train_batch(training_data, 100000, verbose=True)

# Determine the number of images of each class that are placed in each cluster
class_counts = np.zeros((som_size, som_size, classes))
for index, label in np.ndenumerate(training_labels):
    i, j = som.winner(training_data[index])
    class_counts[i, j, label] += 1

# Determine the label for each cluster
class_labels = np.zeros((som_size, som_size))
for i in range(som_size):
    for j in range(som_size):
        class_labels[i, j] = np.argmax(class_counts[i, j])

# Assign each training sample to the neuron it is closest to
train_labels_pred = np.zeros(len(training_labels))
for i in range(len(training_data)):
    closest_neuron_dist, closest_neuron = som.winner(training_data[i])
    train_labels_pred[i] = class_labels[closest_neuron // som_size, closest_neuron % som_size]

# Compute the train accuracy
train_accuracy = accuracy_score(training_labels, train_labels_pred)
print('Train accuracy:', train_accuracy*100,'%')

# Assign each test sample to the neuron it is closest to
test_labels_pred = np.zeros(len(test_labels))
for i in range(len(test_data)):
    closest_neuron_dist, closest_neuron = som.winner(test_data[i])
    test_labels_pred[i] = class_labels[closest_neuron // som_size, closest_neuron % som_size]

# Compute the test accuracy
test_accuracy = accuracy_score(test_labels, test_labels_pred)
print('Test accuracy:', test_accuracy*100,'%')

# Compute the Euclidean distances between each training sample and each neuron in the SOM
weights = som.get_weights()
distances = np.zeros((len(training_data), som_size * som_size))
for i in range(len(training_data)):
    for j in range(som_size * som_size):
        distances[i, j] = np.linalg.norm(training_data[i] - weights[j // som_size, j % som_size])

# Assign each training sample to the neuron it is closest to
assignments = np.argmin(distances, axis=1)

# Compute the DBI for the SOM
dbi_wta = davies_bouldin_score(training_data, assignments)
print('DBI for Winner-Takes-All approach:', dbi_wta)

print("Cluster Labels:",class_labels)


 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 5.159283955491046
Train accuracy: 18.85 %
Test accuracy: 18.95 %
DBI for Winner-Takes-All approach: 2.795395082359505
Cluster Labels: [[2. 2. 2. 2. 2. 0. 0. 0. 5. 8.]
 [2. 2. 2. 2. 0. 0. 0. 0. 5. 3.]
 [6. 6. 6. 6. 0. 0. 0. 0. 3. 3.]
 [6. 6. 6. 6. 5. 5. 5. 5. 5. 3.]
 [9. 6. 6. 2. 5. 5. 5. 5. 3. 3.]
 [7. 7. 7. 7. 4. 8. 8. 8. 3. 3.]
 [7. 7. 7. 7. 7. 8. 8. 8. 2. 6.]
 [4. 9. 9. 9. 9. 8. 8. 8. 2. 1.]
 [4. 9. 4. 4. 9. 9. 5. 1. 1. 1.]
 [9. 4. 4. 4. 4. 7. 1. 1. 1. 1.]]


In [7]:
#On-Center, Off-Surround approach: 30x30
# Load the training and test data
training_data = TrainData
test_data = TestData
training_labels = TrainLabels
test_labels = TestLabels

# Flatten the training and test data and normalize it:
training_data = training_data.reshape(training_data.shape[0], -1) / 255.
test_data = test_data.reshape(test_data.shape[0], -1) / 255.

# Define the parameters for the SOM:
input_len = 784  # number of features
# Define the neighborhood function for the On-Center, Off-Surround approach
def neighborhood_function(distance, sigma):
    return np.exp(-distance**2 / (2*sigma**2)) - np.exp(-1/(2*sigma**2))


# Implement the On-Center, Off-Surround approach:
sigma = 1.5    # standard deviation of the Gaussian neighborhood function
learning_rate = 0.5

# Create a SOM with 30x30 neurons and neighborhood topology
som = MiniSom(30, 30, input_len, sigma=sigma, learning_rate=learning_rate, 
              neighborhood_function='gaussian', topology='rectangular', random_seed=0)
som.pca_weights_init(training_data)
som.train_batch(training_data, 30000, verbose=True)

# Determine the number of images of each class that are placed in each cluster
class_counts = np.zeros((30, 30, classes))
for index, label in np.ndenumerate(training_labels):
    i, j = som.winner(training_data[index])
    class_counts[i, j, label] += 1

# Determine the label for each cluster
class_labels = np.zeros((30, 30))
for i in range(30):
    for j in range(30):
        class_labels[i, j] = np.argmax(class_counts[i, j])

# Determine the final class labels
class_labels_final = np.zeros((30*30,))
for i in range(30):
    for j in range(30):
        class_labels_final[i*30+j] = class_labels[i, j]

# Reshape the class_labels_final array to (10,1) or (1,10)
class_labels_final = class_labels_final.reshape((30, 30))  # or (10, 1)

# Compute the train accuracy
train_labels_pred = np.zeros(len(training_labels))
for i in range(len(training_data)):
    closest_neuron_dist, closest_neuron = som.winner(training_data[i])
    train_labels_pred[i] = class_labels_final[0, closest_neuron]
train_accuracy = accuracy_score(training_labels, train_labels_pred)
print('Train accuracy:', train_accuracy*100 , ' %')

# Compute the test accuracy
test_labels_pred = np.zeros(len(test_labels))
for i in range(len(test_data)):
    closest_neuron_dist, closest_neuron = som.winner(test_data[i])
    test_labels_pred[i] = class_labels_final[0, closest_neuron]
test_accuracy = accuracy_score(test_labels, test_labels_pred)
print('Test accuracy:', test_accuracy*100 , ' %')

# Compute the Euclidean distances between each training sample and each neuron in the SOM
weights = som.get_weights()
distances = np.zeros((len(training_data), 30 * 30))
for i in range(len(training_data)):
    for j in range(30 * 30):
        distances[i, j] = np.linalg.norm(training_data[i] - weights[j // 30, j % 30])

# Assign each training sample to the neuron it is closest to
assignments = np.argmin(distances, axis=1)

#Compute the DBI for the On-Center, Off-Surround approach:
dbi_oos = davies_bouldin_score(training_data, assignments)
print('DBI for On-Center, Off-Surround approach:', dbi_oos)

print("Cluster Labels for On-Center, Off-Surround approach:", class_labels)

 [ 30000 / 30000 ] 100% - 0:00:00 left 
 quantization error: 4.529367046411866
Train accuracy: 19.483333333333334  %
Test accuracy: 18.15  %
DBI for On-Center, Off-Surround approach: 2.6609846069287895
Cluster Labels for On-Center, Off-Surround approach: [[3. 5. 3. 3. 8. 8. 8. 8. 8. 8. 8. 8. 3. 3. 3. 3. 8. 1. 1. 1. 1. 1. 7. 1.
  1. 1. 1. 1. 1. 1.]
 [3. 3. 3. 3. 8. 8. 8. 8. 8. 8. 8. 3. 3. 3. 3. 3. 3. 1. 1. 1. 1. 3. 1. 1.
  1. 1. 1. 1. 1. 1.]
 [3. 3. 3. 3. 8. 8. 8. 8. 8. 8. 8. 8. 3. 3. 3. 3. 3. 1. 1. 1. 1. 8. 1. 1.
  1. 1. 1. 1. 1. 1.]
 [3. 5. 8. 8. 8. 8. 8. 8. 8. 8. 8. 8. 3. 3. 3. 3. 5. 4. 1. 1. 1. 1. 8. 8.
  8. 8. 1. 1. 1. 1.]
 [3. 0. 8. 8. 8. 8. 8. 5. 8. 8. 8. 3. 3. 3. 3. 9. 9. 9. 9. 7. 8. 8. 8. 8.
  8. 8. 8. 1. 1. 1.]
 [0. 0. 8. 8. 8. 8. 5. 5. 5. 8. 8. 8. 2. 9. 3. 9. 9. 9. 7. 7. 7. 8. 8. 8.
  8. 8. 8. 3. 1. 2.]
 [0. 0. 0. 8. 8. 5. 5. 5. 5. 5. 5. 8. 7. 7. 7. 7. 9. 9. 9. 7. 7. 7. 8. 5.
  5. 5. 5. 3. 3. 3.]
 [0. 0. 0. 0. 5. 5. 5. 5. 5. 5. 5. 5. 7. 7. 7. 7. 7. 9. 9. 7. 7. 3. 5. 5.
  5. 5