In [1]:
import numpy as np 
import cv2
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from minisom import MiniSom
from collections import defaultdict
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [2]:
# Define the path to the root folder of your dataset
root_folder = "MNIST Dataset/"

# Define lists to store the image data and labels
data = []
labels = []

# Loop over the folders in the root folder
for i in range(10):
    folder_path = root_folder + str(i) + "/"

    # Get the label corresponding to the folder name
    label = i

    # Loop over the images in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg'):
            image_path = os.path.join(folder_path, filename)

            # Read the image using OpenCV and convert it to grayscale
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            # Add the image and label to the dataset lists
            data.append(image)
            labels.append(label)

# Convert the data and labels lists to numpy arrays
data = np.array(data)
labels = np.array(labels)

# Split the data into training and testing sets, with 90% of each class for training
train_data = []
train_labels = []
test_data = []
test_labels = []

for i in range(10):
    class_data = data[labels == i]
    class_labels = labels[labels == i]

    train_class_data, test_class_data, train_class_labels, test_class_labels = train_test_split(class_data, class_labels, test_size=0.1)

    train_data.append(train_class_data)
    train_labels.append(train_class_labels)
    test_data.append(test_class_data)
    test_labels.append(test_class_labels)

# Concatenate the training and testing data and labels for each class
train_data = np.concatenate(train_data)
train_labels = np.concatenate(train_labels)
test_data = np.concatenate(test_data)
test_labels = np.concatenate(test_labels)

# Create a pandas dataframe to store the data and labels
train_df = pd.DataFrame(train_data.reshape(train_data.shape[0], -1))
train_df['label'] = train_labels

test_df = pd.DataFrame(test_data.reshape(test_data.shape[0], -1))
test_df['label'] = test_labels

#shuffle data frame:
train_df = train_df.sample(frac = 1)
test_df = test_df.sample(frac = 1)


In [3]:
# prepare Train and Test  Data and labels
TrainLabels = train_df['label']
TrainData = train_df.drop('label',axis=1)
TestLabels = test_df['label']
TestData = test_df.drop('label',axis=1)
TrainData = np.array(TrainData)
TrainLabels = np.array(TrainLabels)
TestLabels = np.array(TestLabels)
TestData = np.array(TestData)

In [4]:
# Load the training and test data
training_data = TrainData
test_data = TestData
training_labels = TrainLabels
test_labels = TestLabels

# Build SOM with 30x30 neurons
som = MiniSom(30, 30, len(training_data[0]), sigma=1.0, learning_rate=0.5, neighborhood_function='bubble', topology='rectangular')

# Train the SOM with all training set images
som.random_weights_init(training_data)
for i in tqdm(range(1000)):
    som.train_random(training_data, num_iteration=1)

# Get cluster labels for each training data point
train_clusters = som.win_map(training_data)
train_cluster_labels = []
print(len(training_data))
for i in range(len(training_data)):
    cluster = train_clusters[(som.winner(training_data[i]))]
    indices = [np.where(np.all(training_data == x, axis=1))[0][0] for x in cluster]
    labels = [training_labels[index] for index in indices]
    counts = np.bincount(labels)
    train_cluster_labels.append(np.argmax(counts))
    if(i % 500 == 0):
        print(i)
# print("Training data cluster labels:", train_cluster_labels)

# Get the number of images of each class in each cluster
num_classes = len(np.unique(training_labels))
cluster_class_counts = []
for i in range(num_classes):
    cluster_class_counts = []
    print(i)
    for j in range(30):
        cluster = train_clusters[(i,j)]
        counts = []
        # print(j)
        for k in range(len(training_data)):
            if any(np.all(training_data[k] == x) for x in cluster):
                indices = np.where(np.all(training_data == training_data[k], axis=1))[0]
                if indices.size > 0:
                    index = indices[0]
                    counts.append(training_labels[index])
        counts = np.bincount(counts, minlength=num_classes)
        cluster_class_counts.append(counts)

print('done')

# Assign a label to each cluster based on the class with the highest count
cluster_labels = []
for counts in cluster_class_counts:
    cluster_labels.append(np.argmax(counts))
# print("Cluster labels:", cluster_labels)

# Compute Davies-Bouldin index on the clustering result
dbi_score = davies_bouldin_score(training_data, train_cluster_labels)
print("Davies-Bouldin index:", dbi_score)

# Get test data accuracy
test_clusters = som.win_map(test_data)
test_cluster_labels = []
for i in range(len(test_data)):
    cluster = test_clusters[(som.winner(test_data[i]))]
    test_cluster_labels.append(np.argmax(np.bincount([test_labels[np.where(np.all(test_data == x, axis=1))[0][0]] for x in cluster if np.all(test_data == x, axis=1).any()])))

test_accuracy = np.mean(np.array(test_cluster_labels) == np.array(test_labels))
print("Test data accuracy:", test_accuracy)

# Get train data accuracy
train_accuracy = np.mean(np.array(train_cluster_labels) == np.array(training_labels))
print("Training data accuracy:", train_accuracy)



100%|██████████| 1000/1000 [00:02<00:00, 384.88it/s]


18000
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
0
1
2
3
4
5
6
7
8
9
done
Davies-Bouldin index: 4.0326464254815
Test data accuracy: 0.907
Training data accuracy: 0.8818333333333334


In [18]:
ds0=[]
ds1=[]
ds2=[]
ds3=[]
ds4=[]
ds5=[]
ds6=[]
ds7=[]
ds8=[]
ds9=[]
lbl0=[]
lbl1=[]
lbl2=[]
lbl3=[]
lbl4=[]
lbl5=[]
lbl6=[]
lbl7=[]
lbl8=[]
lbl9=[]

poscnt = 0
for i in range(len(training_data)):
    if(training_labels[i]==0):
        ds0.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl0.append(1)
            poscnt +=1
        else:
            lbl0.append(0)
            
    if(training_labels[i]==1):
        ds1.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl1.append(1)
            poscnt +=1
        else:
            lbl1.append(0)
            
    if(training_labels[i]==2):
        ds2.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl2.append(1)
            poscnt +=1
        else:
            lbl2.append(0)
            
    if(training_labels[i]==3):
        ds3.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl3.append(1)
            poscnt +=1
        else:
            lbl3.append(0)

    if(training_labels[i]==4):
        ds4.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl4.append(1)
            poscnt +=1
        else:
            lbl4.append(0)
            
            
    if(training_labels[i]==5):
        ds5.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl5.append(1)
            poscnt +=1
        else:
            lbl5.append(0)
            
    if(training_labels[i]==6):
        ds6.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl6.append(1)
            poscnt +=1
        else:
            lbl6.append(0)
            
    if(training_labels[i]==7):
        ds7.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl7.append(1)
            poscnt +=1
        else:
            lbl7.append(0)
            
            
    if(training_labels[i]==8):
        ds8.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl8.append(1)
            poscnt +=1
        else:
            lbl8.append(0)
                     
    
    
    
    if(training_labels[i]==9):
        ds9.append(training_data[i])
        if(train_cluster_labels[i] == train_labels[i]):
            lbl9.append(1)
            poscnt +=1
        else:
            lbl9.append(0)
        
        
    

In [24]:
df = pd.DataFrame(ds0)
df.to_excel('ds0.xlsx', index=False)
df = pd.DataFrame(ds1)
df.to_excel('ds1.xlsx', index=False)
df = pd.DataFrame(ds2)
df.to_excel('ds2.xlsx', index=False)
df = pd.DataFrame(ds3)
df.to_excel('ds3.xlsx', index=False)
df = pd.DataFrame(ds4)
df.to_excel('ds4.xlsx', index=False)
df = pd.DataFrame(ds5)
df.to_excel('ds5.xlsx', index=False)
df = pd.DataFrame(ds6)
df.to_excel('ds6.xlsx', index=False)
df = pd.DataFrame(ds7)
df.to_excel('ds7.xlsx', index=False)
df = pd.DataFrame(ds8)
df.to_excel('ds8.xlsx', index=False)
df = pd.DataFrame(ds9)
df.to_excel('ds9.xlsx', index=False)

In [25]:
df = pd.DataFrame(lbl0)
df.to_excel('lbl0.xlsx', index=False)
df = pd.DataFrame(lbl1)
df.to_excel('lbl1.xlsx', index=False)
df = pd.DataFrame(lbl2)
df.to_excel('lbl2.xlsx', index=False)
df = pd.DataFrame(lbl3)
df.to_excel('lbl3.xlsx', index=False)
df = pd.DataFrame(lbl4)
df.to_excel('lbl4.xlsx', index=False)
df = pd.DataFrame(lbl5)
df.to_excel('lbl5.xlsx', index=False)
df = pd.DataFrame(lbl6)
df.to_excel('lbl6.xlsx', index=False)
df = pd.DataFrame(lbl7)
df.to_excel('lbl7.xlsx', index=False)
df = pd.DataFrame(lbl8)
df.to_excel('lbl8.xlsx', index=False)
df = pd.DataFrame(lbl9)
df.to_excel('lbl9.xlsx', index=False)