In [1]:
import numpy as np 
import cv2
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from minisom import MiniSom
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [2]:
# Define the path to the root folder of your dataset
root_folder = "MNIST Dataset/"

# Define lists to store the image data and labels
data = []
labels = []

# Loop over the folders in the root folder
for i in range(10):
    folder_path = root_folder + str(i) + "/"

    # Get the label corresponding to the folder name
    label = i

    # Loop over the images in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg'):
            image_path = os.path.join(folder_path, filename)

            # Read the image using OpenCV and convert it to grayscale
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            # Add the image and label to the dataset lists
            data.append(image)
            labels.append(label)

# Convert the data and labels lists to numpy arrays
data = np.array(data)
labels = np.array(labels)

# Split the data into training and testing sets, with 90% of each class for training
train_data = []
train_labels = []
test_data = []
test_labels = []

for i in range(10):
    class_data = data[labels == i]
    class_labels = labels[labels == i]

    train_class_data, test_class_data, train_class_labels, test_class_labels = train_test_split(class_data, class_labels, test_size=0.1)

    train_data.append(train_class_data)
    train_labels.append(train_class_labels)
    test_data.append(test_class_data)
    test_labels.append(test_class_labels)

# Concatenate the training and testing data and labels for each class
train_data = np.concatenate(train_data)
train_labels = np.concatenate(train_labels)
test_data = np.concatenate(test_data)
test_labels = np.concatenate(test_labels)

# Create a pandas dataframe to store the data and labels
train_df = pd.DataFrame(train_data.reshape(train_data.shape[0], -1))
train_df['label'] = train_labels

test_df = pd.DataFrame(test_data.reshape(test_data.shape[0], -1))
test_df['label'] = test_labels

#shuffle data frame:
train_df = train_df.sample(frac = 1)
test_df = test_df.sample(frac = 1)


In [3]:
# prepare Train and Test  Data and labels
TrainLabels = train_df['label']
TrainData = train_df.drop('label',axis=1)
TestLabels = test_df['label']
TestData = test_df.drop('label',axis=1)
TrainData = np.array(TrainData)
TrainLabels = np.array(TrainLabels)
TestLabels = np.array(TestLabels)
TestData = np.array(TestData)

In [4]:
#winner takes all approach , many clusters 
# Load the training and test data
training_data = TrainData
test_data = TestData
training_labels = TrainLabels
test_labels = TestLabels

# Flatten the training and test data and normalize it:
training_data = training_data.reshape(training_data.shape[0], -1) / 255.
test_data = test_data.reshape(test_data.shape[0], -1) / 255.

# Define the parameters for the SOM:
input_len = 784  # number of features
classes = 10     # number of classes
som_size = 30    # size of the SOM
sigma = 1.0      # neighborhood radius
learning_rate = 0.5

# Implement the Winner-Takes-All approach:
# Create a SOM with one neuron for each class
som = MiniSom(som_size, som_size, input_len, sigma=sigma, learning_rate=learning_rate, 
              neighborhood_function='gaussian', random_seed=0)
som.pca_weights_init(training_data)
som.train_batch(training_data, 100000, verbose=True)

# Determine the number of images of each class that are placed in each cluster
labels_map = som.labels_map(training_data, training_labels)
class_counts = np.zeros((som_size, som_size, classes))
for index, label in np.ndenumerate(training_labels):
    i, j = som.winner(training_data[index])
    class_counts[i, j, label] += 1

# Determine the label for each cluster
class_labels = np.zeros((som_size, som_size))
for i in range(som_size):
    for j in range(som_size):
        class_labels[i, j] = np.argmax(class_counts[i, j])

# Compute the DBI evaluation criteria on the clustering result
labels_map = som.labels_map(training_data, training_labels)
# dbi_wta = davies_bouldin_score(training_data, np.array([class_labels[x] for x in labels_map]))
dbi_wta = davies_bouldin_score(training_data, training_labels)
print('dbi',dbi_wta)
# Compute the train data accuracy
train_labels_pred = [np.argmax(class_counts[som.winner(x)]) for x in training_data]
accuracy_wta_train = np.mean(np.equal(training_labels, train_labels_pred))
print('Train Accuracy for Winner-Takes-All approach:', accuracy_wta_train)

# Compute the test data accuracy
test_labels_pred = [np.argmax(class_counts[som.winner(x)]) for x in test_data]
accuracy_wta_test = np.mean(np.equal(test_labels, test_labels_pred))
print('Test Accuracy for Winner-Takes-All approach:', accuracy_wta_test)

print("Cluster Labels:",class_labels)



 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 4.331078724100794
dbi 3.935366642316235
Train Accuracy for Winner-Takes-All approach: 0.9314444444444444
Test Accuracy for Winner-Takes-All approach: 0.9215
Cluster Labels: [[0. 0. 0. 0. 0. 0. 0. 1. 1. 2. 2. 2. 8. 6. 4. 4. 5. 5. 8. 8. 3. 3. 3. 3.
  3. 3. 5. 5. 5. 0.]
 [0. 5. 0. 0. 6. 1. 1. 1. 1. 1. 7. 3. 3. 4. 4. 7. 5. 8. 8. 8. 3. 3. 5. 3.
  5. 3. 3. 3. 5. 5.]
 [6. 5. 5. 3. 1. 1. 1. 1. 1. 1. 3. 7. 2. 2. 4. 9. 8. 8. 8. 8. 3. 3. 3. 3.
  3. 3. 3. 3. 3. 3.]
 [7. 5. 1. 2. 1. 1. 1. 1. 1. 1. 7. 7. 7. 7. 7. 7. 9. 5. 8. 8. 3. 3. 3. 3.
  3. 3. 3. 3. 3. 3.]
 [7. 2. 3. 1. 1. 1. 1. 1. 1. 1. 7. 7. 7. 7. 7. 7. 9. 3. 8. 8. 3. 3. 3. 3.
  3. 3. 3. 3. 3. 3.]
 [2. 2. 2. 6. 2. 1. 1. 1. 1. 1. 7. 7. 7. 7. 7. 9. 9. 9. 8. 8. 3. 3. 3. 3.
  3. 3. 3. 8. 8. 8.]
 [2. 4. 6. 6. 6. 1. 1. 1. 1. 1. 1. 8. 7. 9. 9. 9. 9. 9. 8. 8. 3. 3. 5. 5.
  5. 5. 5. 5. 5. 5.]
 [5. 5. 6. 6. 6. 1. 1. 1. 1. 1. 1. 1. 8. 4. 9. 9. 9. 4. 4. 8. 3. 5. 5. 5.
  5. 5. 5. 5. 5. 5.]
 [5.

In [5]:
ds0=[]
ds1=[]
ds2=[]
ds3=[]
ds4=[]
ds5=[]
ds6=[]
ds7=[]
ds8=[]
ds9=[]
lbl0=[]
lbl1=[]
lbl2=[]
lbl3=[]
lbl4=[]
lbl5=[]
lbl6=[]
lbl7=[]
lbl8=[]
lbl9=[]

poscnt = 0
for i in range(len(training_data)):
    if(training_labels[i]==0):
        ds0.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl0.append(1)
            poscnt +=1
        else:
            lbl0.append(0)
            
    if(training_labels[i]==1):
        ds1.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl1.append(1)
            poscnt +=1
        else:
            lbl1.append(0)
            
    if(training_labels[i]==2):
        ds2.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl2.append(1)
            poscnt +=1
        else:
            lbl2.append(0)
            
    if(training_labels[i]==3):
        ds3.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl3.append(1)
            poscnt +=1
        else:
            lbl3.append(0)

    if(training_labels[i]==4):
        ds4.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl4.append(1)
            poscnt +=1
        else:
            lbl4.append(0)
            
            
    if(training_labels[i]==5):
        ds5.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl5.append(1)
            poscnt +=1
        else:
            lbl5.append(0)
            
    if(training_labels[i]==6):
        ds6.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl6.append(1)
            poscnt +=1
        else:
            lbl6.append(0)
            
    if(training_labels[i]==7):
        ds7.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl7.append(1)
            poscnt +=1
        else:
            lbl7.append(0)
            
            
    if(training_labels[i]==8):
        ds8.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl8.append(1)
            poscnt +=1
        else:
            lbl8.append(0)
                     
    
    
    
    if(training_labels[i]==9):
        ds9.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl9.append(1)
            poscnt +=1
        else:
            lbl9.append(0)
        
        
    