In [1]:
import numpy as np 
import cv2
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from minisom import MiniSom
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [2]:
# Define the path to the root folder of your dataset
root_folder = "MNIST Dataset/"

# Define lists to store the image data and labels
data = []
labels = []

# Loop over the folders in the root folder
for i in range(10):
    folder_path = root_folder + str(i) + "/"

    # Get the label corresponding to the folder name
    label = i

    # Loop over the images in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg'):
            image_path = os.path.join(folder_path, filename)

            # Read the image using OpenCV and convert it to grayscale
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            # Add the image and label to the dataset lists
            data.append(image)
            labels.append(label)

# Convert the data and labels lists to numpy arrays
data = np.array(data)
labels = np.array(labels)

# Split the data into training and testing sets, with 90% of each class for training
train_data = []
train_labels = []
test_data = []
test_labels = []

for i in range(10):
    class_data = data[labels == i]
    class_labels = labels[labels == i]

    train_class_data, test_class_data, train_class_labels, test_class_labels = train_test_split(class_data, class_labels, test_size=0.1)

    train_data.append(train_class_data)
    train_labels.append(train_class_labels)
    test_data.append(test_class_data)
    test_labels.append(test_class_labels)

# Concatenate the training and testing data and labels for each class
train_data = np.concatenate(train_data)
train_labels = np.concatenate(train_labels)
test_data = np.concatenate(test_data)
test_labels = np.concatenate(test_labels)

# Create a pandas dataframe to store the data and labels
train_df = pd.DataFrame(train_data.reshape(train_data.shape[0], -1))
train_df['label'] = train_labels

test_df = pd.DataFrame(test_data.reshape(test_data.shape[0], -1))
test_df['label'] = test_labels

#shuffle data frame:
train_df = train_df.sample(frac = 1)
test_df = test_df.sample(frac = 1)


In [3]:
# prepare Train and Test  Data and labels
TrainLabels = train_df['label']
TrainData = train_df.drop('label',axis=1)
TestLabels = test_df['label']
TestData = test_df.drop('label',axis=1)
TrainData = np.array(TrainData)
TrainLabels = np.array(TrainLabels)
TestLabels = np.array(TestLabels)
TestData = np.array(TestData)

In [4]:
#true Winner takes all approach with only 10 clusters 
# Load the training and test data
training_data = TrainData
test_data = TestData
training_labels = TrainLabels
test_labels = TestLabels

# Flatten the training and test data and normalize it:
training_data = training_data.reshape(training_data.shape[0], -1) / 255.
test_data = test_data.reshape(test_data.shape[0], -1) / 255.

# Define the parameters for the SOM:
input_len = 784  # number of features
classes = 10     # number of classes
som_size = 10    # size of the SOM
sigma = 1        # neighborhood radius
learning_rate = 0.5

# Implement the winner-takes-all approach:
# Create a SOM with one neuron for each class
som = MiniSom(som_size, som_size, input_len, sigma=sigma, learning_rate=learning_rate, 
              neighborhood_function='gaussian', random_seed=0)
som.pca_weights_init(training_data)
som.train_batch(training_data, 300000, verbose=True)
# Determine the number of images of each class that are placed in each cluster
class_counts = np.zeros((som_size, som_size, classes))
for index, label in np.ndenumerate(training_labels):
    i, j = som.winner(training_data[index])
    class_counts[i, j, label] += 1

# Determine the label for each cluster
class_labels = np.zeros((som_size, som_size))
for i in range(som_size):
    for j in range(som_size):
        class_labels[i, j] = np.argmax(class_counts[i, j])
# Determine the final class labels
class_labels_final = np.zeros((classes,))
for i in range(som_size):
    for j in range(som_size):
        class_labels_final[int(class_labels[i, j])] = np.argmax(class_counts[i, j])
        
# Reshape the class_labels_final array to (10,1) or (1,10)
class_labels_final = class_labels_final.reshape((1, 10))  # or (10, 1)

# Compute the train accuracy
train_labels_pred = np.zeros(len(training_labels))
for i in range(len(training_data)):
    closest_neuron_dist, closest_neuron = som.winner(training_data[i])
    train_labels_pred[i] = class_labels_final[0, closest_neuron]
train_accuracy = accuracy_score(training_labels, train_labels_pred)
print('Train accuracy:', train_accuracy*100 , ' %')


# Compute the test accuracy
test_labels_pred = np.zeros(len(test_labels))
for i in range(len(test_data)):
    closest_neuron_dist, closest_neuron = som.winner(test_data[i])
    test_labels_pred[i] = class_labels_final[0, closest_neuron]
test_accuracy = accuracy_score(test_labels, test_labels_pred)
print('Test accuracy:', test_accuracy*100 , ' %')

# Compute the Euclidean distances between each training sample and each neuron in the SOM
weights = som.get_weights()
distances = np.zeros((len(training_data), som_size * som_size))
for i in range(len(training_data)):
    for j in range(som_size * som_size):
        distances[i, j] = np.linalg.norm(training_data[i] - weights[j // som_size, j % som_size])

# Assign each training sample to the neuron it is closest to
assignments = np.argmin(distances, axis=1)

# Compute the DBI for the SOM
dbi_wta = davies_bouldin_score(training_data, assignments)
print('DBI for Winner-Takes-All approach:', dbi_wta)

print("Cluster Labels:", class_labels_final)




 [ 300000 / 300000 ] 100% - 0:00:00 left 
 quantization error: 5.322994255490633
Train accuracy: 14.744444444444444  %
Test accuracy: 14.2  %
DBI for Winner-Takes-All approach: 2.8945307568222627
Cluster Labels: [[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]]


In [5]:
ds0=[]
ds1=[]
ds2=[]
ds3=[]
ds4=[]
ds5=[]
ds6=[]
ds7=[]
ds8=[]
ds9=[]
lbl0=[]
lbl1=[]
lbl2=[]
lbl3=[]
lbl4=[]
lbl5=[]
lbl6=[]
lbl7=[]
lbl8=[]
lbl9=[]

poscnt = 0
for i in range(len(training_data)):
    if(training_labels[i]==0):
        ds0.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl0.append(1)
            poscnt +=1
        else:
            lbl0.append(0)
            
    if(training_labels[i]==1):
        ds1.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl1.append(1)
            poscnt +=1
        else:
            lbl1.append(0)
            
    if(training_labels[i]==2):
        ds2.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl2.append(1)
            poscnt +=1
        else:
            lbl2.append(0)
            
    if(training_labels[i]==3):
        ds3.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl3.append(1)
            poscnt +=1
        else:
            lbl3.append(0)

    if(training_labels[i]==4):
        ds4.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl4.append(1)
            poscnt +=1
        else:
            lbl4.append(0)
            
            
    if(training_labels[i]==5):
        ds5.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl5.append(1)
            poscnt +=1
        else:
            lbl5.append(0)
            
    if(training_labels[i]==6):
        ds6.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl6.append(1)
            poscnt +=1
        else:
            lbl6.append(0)
            
    if(training_labels[i]==7):
        ds7.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl7.append(1)
            poscnt +=1
        else:
            lbl7.append(0)
            
            
    if(training_labels[i]==8):
        ds8.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl8.append(1)
            poscnt +=1
        else:
            lbl8.append(0)
                     
    
    
    
    if(training_labels[i]==9):
        ds9.append(training_data[i])
        if(train_labels_pred[i] == train_labels[i]):
            lbl9.append(1)
            poscnt +=1
        else:
            lbl9.append(0)
        
        
    