In [2]:
import numpy as np 
import cv2
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
import torch
import torchvision


In [3]:
# Print the installed PyTorch version so the environment used for this run is recorded.
print(torch.__version__)

2.0.0


In [4]:
# Load the digit images from disk.
# The root folder must contain one sub-folder per digit, named "0" ... "9";
# the folder name is used as the class label of every image inside it.
root_folder = "MNIST Dataset/"

# Accumulators for the decoded images and their labels
data = []
labels = []

# Loop over the ten digit folders
for i in range(10):
    folder_path = os.path.join(root_folder, str(i))

    # The label is the digit the folder is named after
    label = i

    # os.listdir returns entries in arbitrary order; sorting keeps the dataset
    # order (and anything seeded downstream) stable across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.jpg'):
            continue
        image_path = os.path.join(folder_path, filename)

        # Decode the image as a single-channel (grayscale) array
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread signals failure by returning None instead of raising.
        # A None entry would turn `data` into an object array and break every
        # later reshape, so unreadable files are reported and skipped.
        if image is None:
            print(f"Skipping unreadable image: {image_path}")
            continue

        data.append(image)
        labels.append(label)

# Convert the data and labels lists to numpy arrays
data = np.array(data)
labels = np.array(labels)

# Split the data into training and testing sets, class by class, so that every
# digit contributes the same train/test proportion (90% / 10%).
TEST_FRACTION = 0.1
# Fixed seed: without it train_test_split draws a different split on every run,
# which makes all downstream numbers irreproducible.
SPLIT_SEED = 42

train_data = []
train_labels = []
test_data = []
test_labels = []

for i in range(10):
    class_mask = labels == i
    class_data = data[class_mask]
    class_labels = labels[class_mask]

    train_class_data, test_class_data, train_class_labels, test_class_labels = train_test_split(
        class_data,
        class_labels,
        test_size=TEST_FRACTION,
        random_state=SPLIT_SEED,
    )

    train_data.append(train_class_data)
    train_labels.append(train_class_labels)
    test_data.append(test_class_data)
    test_labels.append(test_class_labels)

# Stitch the per-class pieces back together.
# NOTE: the resulting arrays are ordered by class (all 0s, then all 1s, ...).
train_data = np.concatenate(train_data)
train_labels = np.concatenate(train_labels)
test_data = np.concatenate(test_data)
test_labels = np.concatenate(test_labels)

# Store each split as a dataframe: one row per image, one column per pixel
# (images are flattened), plus a trailing 'label' column.
train_df = pd.DataFrame(train_data.reshape(train_data.shape[0], -1))
train_df['label'] = train_labels

test_df = pd.DataFrame(test_data.reshape(test_data.shape[0], -1))
test_df['label'] = test_labels


In [5]:
# Separate the pixel columns from the 'label' column of each dataframe.
# The training pair is materialised as NumPy arrays; the test pair is kept
# as pandas objects (DataFrame / Series).
TrainData = np.array(train_df.drop(columns='label'))
TrainLabels = np.array(train_df['label'])
TestData = test_df.drop(columns='label')
TestLabels = test_df['label']

In [6]:
# Sanity check: display the dimensions of the training feature matrix.
TrainData.shape

(18000, 784)

In [None]:
# Preprocess the dataset: divide the pixel values by 255.0.
# Only the training images are turned into a (float64) torch tensor;
# the test images stay in whatever container TestData already is.
train_images = torch.tensor(TrainData / 255.0, dtype=torch.float64)
test_images = TestData / 255.0

# Lower-case aliases for the label containers
train_labels, test_labels = TrainLabels, TestLabels

# Number of classes in the dataset
num_classes = 10

# Part 1: Build a SOM with one neuron for each class
class SOM(torch.nn.Module):
    """Minimal self-organising map: a set of prototype vectors, one per neuron.

    Args:
        num_classes: number of neurons (prototype vectors) in the map.
        input_dim: length of a flattened input sample. Defaults to 784, so
            existing ``SOM(num_classes)`` calls keep their behaviour.

    Attributes:
        weights: ``(num_classes, input_dim)`` prototype matrix, initialised
            from a standard normal distribution. It is registered as a
            Parameter (so it is saved in ``state_dict`` and moved by ``.to()``)
            but with ``requires_grad=False``: a SOM is trained by in-place
            competitive updates, not by back-propagation, and autograd refuses
            in-place writes to a leaf tensor that requires grad.
    """

    def __init__(self, num_classes, input_dim=784):
        super().__init__()
        self.weights = torch.nn.Parameter(
            torch.randn(num_classes, input_dim), requires_grad=False
        )

    def forward(self, x):
        """Return the index of the best-matching neuron for every sample.

        Args:
            x: tensor whose elements can be regrouped into rows of length
                ``input_dim`` (e.g. ``(input_dim,)``, ``(N, input_dim)`` or
                ``(N, H, W)`` with ``H * W == input_dim``).

        Returns:
            1-D integer tensor with one winner index per row.
        """
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = x.reshape(-1, self.weights.shape[1])
        # Both operands are promoted to float64 so cdist sees matching dtypes.
        distances = torch.cdist(x.double(), self.weights.double())

        winners = torch.argmin(distances, dim=1)
        return winners

# Train the SOM (one neuron per class) with the online Kohonen rule:
#     w_j <- w_j + lr(t) * h(j, winner, t) * (x - w_j)
# where h is a Gaussian neighbourhood over the 1-D neuron index.
NUM_EPOCHS = 10
INITIAL_LR, FINAL_LR = 0.5, 0.01
INITIAL_SIGMA, FINAL_SIGMA = num_classes / 2, 0.5

# Seed before building the SOM so both the random prototypes and the
# per-epoch shuffling are reproducible.
torch.manual_seed(0)

s = SOM(num_classes)
s.train()

# Work in the prototypes' dtype so the in-place update needs no casting.
train_samples = torch.as_tensor(train_images).to(s.weights.dtype)
num_samples = len(train_samples)
total_steps = NUM_EPOCHS * num_samples

# Position of every neuron on the 1-D map, shaped (num_neurons, 1) so it
# broadcasts against the (num_neurons, features) prototype matrix.
neuron_positions = torch.arange(s.weights.shape[0], dtype=s.weights.dtype).unsqueeze(1)

step = 0
# no_grad: the prototypes are updated in place; autograd forbids in-place
# writes to a leaf tensor that requires grad, and no gradients are needed.
with torch.no_grad():
    for epoch in range(NUM_EPOCHS):
        # The training set is stored class by class; presenting it in that
        # order lets the last class seen dominate the map, so shuffle.
        for sample in train_samples[torch.randperm(num_samples)]:
            # Exponential decay of learning rate and neighbourhood radius.
            progress = step / total_steps
            lr = INITIAL_LR * (FINAL_LR / INITIAL_LR) ** progress
            sigma = INITIAL_SIGMA * (FINAL_SIGMA / INITIAL_SIGMA) ** progress

            winner = s(sample.unsqueeze(0))  # shape (1,)

            # Gaussian pull: 1 for the winner, shrinking with map distance.
            influence = torch.exp(
                -(neuron_positions - winner.to(neuron_positions.dtype)) ** 2 / (2 * sigma ** 2)
            )
            # A learning rate below 1 moves prototypes towards the sample
            # instead of overwriting the winner with it.
            s.weights += lr * influence * (sample - s.weights)
            step += 1

# Count how many images of each class land in each cluster, then derive the
# cluster sizes and give every cluster the label of its majority class.
with torch.no_grad():
    # Best-matching neuron (cluster index) of every training image
    cluster_assignments = s(train_images)
true_labels = torch.as_tensor(np.asarray(train_labels), dtype=torch.long)

num_clusters = s.weights.shape[0]

# class_counts[c, k] = number of training images of class k mapped to cluster c.
# Each (cluster, class) pair is encoded as one flat bin index so a single
# bincount builds the whole contingency table.
class_counts = torch.bincount(
    cluster_assignments * num_classes + true_labels,
    minlength=num_clusters * num_classes,
).reshape(num_clusters, num_classes)

# Kept as float, like the original torch.zeros buffer, for later divisions.
cluster_sizes = class_counts.sum(dim=1).to(torch.float32)

# Majority vote; clusters that attracted no image are marked with -1.
cluster_labels = class_counts.argmax(dim=1)
cluster_labels[cluster_sizes == 0] = -1

# Print the cluster sizes and labels
print('Cluster Sizes:', cluster_sizes)
print('Cluster Labels:', cluster_labels)
print('Images per class in each cluster (rows: clusters, columns: classes):')
print(class_counts)

# Part 3: Compute the Davies-Bouldin Index (lower is better)
#     DBI = mean_i  max_{j != i}  (S_i + S_j) / M_ij
# S_i  : mean distance of the samples in cluster i to prototype i (scatter)
# M_ij : distance between prototypes i and j (separation)
# The SOM prototypes act as cluster centres. They are used un-normalised so
# scatter and separation are measured in the same space as the samples.
with torch.no_grad():
    prototypes = s.weights.detach().double()                          # (k, d)
    samples = train_images.reshape(-1, prototypes.shape[1]).double()  # (n, d)
    num_clusters = prototypes.shape[0]

    # Distance of every sample to its best-matching prototype, and which one
    nearest_distance, nearest_cluster = torch.cdist(samples, prototypes).min(dim=1)

    members = torch.bincount(nearest_cluster, minlength=num_clusters)
    occupied = members > 0

    # S_i: sum the member distances per cluster, then divide by the member
    # count (clamped so empty clusters give 0 instead of dividing by zero).
    intra_cluster_distances = torch.zeros(num_clusters, dtype=torch.float64)
    intra_cluster_distances.index_add_(0, nearest_cluster, nearest_distance)
    intra_cluster_distances /= members.clamp(min=1)

    # M_ij: pairwise prototype distances
    distances = torch.cdist(prototypes, prototypes)

    # R_ij = (S_i + S_j) / M_ij
    inter_cluster_distances = (
        intra_cluster_distances[:, None] + intra_cluster_distances[None, :]
    ) / distances

    # A cluster is never compared with itself or with an empty cluster;
    # -inf removes those entries from the row-wise max below.
    excluded = torch.eye(num_clusters, dtype=torch.bool) | ~(occupied[:, None] & occupied[None, :])
    inter_cluster_distances[excluded] = float('-inf')

    if occupied.sum() >= 2:
        DBI = inter_cluster_distances[occupied].max(dim=1).values.mean()
    else:
        # The index is undefined with fewer than two non-empty clusters.
        DBI = torch.tensor(float('nan'))

print('Davies-Bouldin Index:', DBI.item())
