In [54]:
import torch
from sklearn.preprocessing import StandardScaler
import torchvision.transforms as T
from torch import nn
from umap.umap_ import UMAP
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.cluster import KMeans

Redefine the model architecture but instead of returning the last layer, we return the penultimate layer

In [55]:
class LeNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = torch.nn.Sequential(
            nn.Conv2d(3,9,(5,5)),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(9,18,(5,5)),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Flatten(1,-1),
            nn.Linear(288,64)
            )
        self.relu = nn.ReLU()
        self.l = nn.Linear(64,5)

    def forward(self, x):
        x = self.net(x)
        return x


Load the model and the training data

In [56]:
model = LeNet()
model.load_state_dict(torch.load('Saved Data/models/model.pt', weights_only=True))
model.eval()
train_images = torch.load('Saved Data/Data/train_images.pt')
train_labels = torch.load('Saved Data/Data/train_labels.pt')
test_images = torch.load('Saved Data/Data/test_images.pt')
test_labels = torch.load('Saved Data/Data/test_labels.pt')

  train_images = torch.load('Saved Data/Data/train_images.pt')
  train_labels = torch.load('Saved Data/Data/train_labels.pt')
  test_images = torch.load('Saved Data/Data/test_images.pt')
  test_labels = torch.load('Saved Data/Data/test_labels.pt')


Make sure the training data was not corrupted

In [57]:
test_images.shape

torch.Size([10000, 3, 28, 28])

Sort all the images by labels, and keep track of the index for later use

In [58]:

transform = T.Compose([
    T.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5))
])
train_output = [[] for _ in range(5)]
train_indices = [[] for _ in range(5)]
test_output = [[] for _ in range(5)]
test_indices = [[] for _ in range(5)]
with torch.no_grad():
    for index,(x, label) in enumerate(zip(train_images, train_labels)):
        train_output[label].append(model(transform(torch.unsqueeze(x,0)))[0])
        train_indices[label].append(index)
    for index, (x, label) in enumerate(zip(test_images, test_labels)):
        test_output[label].append(model(transform(torch.unsqueeze(x,0)))[0])
        test_indices[label].append(index)

In [59]:
print(train_output[0][0])

tensor([  5.6822,   0.7227,  -3.3618,  -2.8614,  14.4085,  22.6241,  -2.7060,
         -4.6660,  -4.1816,  -2.0434,   3.4939,  -2.3240,  -2.4109,  -6.6759,
          9.2076,   9.0879,  -6.5901,  -1.6390,  -3.6716,   8.9184,   8.5940,
         15.7716,   0.0287,  -6.3466,   2.6248,   0.9239,   1.3071,   0.1489,
         11.7156, -11.5195,  -1.1184,  10.9707,  -2.9344,  -7.8740,   8.0705,
         -2.9119,   4.5426,  -2.0824,   5.6963,  10.0487,   9.3382, -14.4961,
          9.7506,  -2.6642,   6.8131,   0.8836,   6.7535,  -4.0043,  -1.6445,
          5.5787,  -2.2131, -10.3292,   6.1588,  -1.7774,   3.7803,   1.2636,
         -2.3724,   4.2579,   4.4073,   1.5223,  -3.9662,  10.7345,   5.3141,
         -2.8403])


Create a reducer to transform n-dimensional tensors in 2-dimensional tensors

In [60]:
reducer1 = UMAP()

Normalize the data to aid with dimensionality reducing speeds

In [None]:
train = [StandardScaler().fit_transform(train_output[i]) for i in range(5)]
test = [StandardScaler().fit_transform(test_output[i]) for i in range(5)]

Reduce the dimensionality of the data

In [None]:
train = [reducer1.fit_transform(train[i]) for i in range(5)]
test = [reducer1.fit_transform(test[i]) for i in range(5)]


(10133, 2)

For every single group of labels, figure out the number of clusters and their centers by trying out 2-10 clusters and seeing which one results in the highest silhouette score. Then, for each cluster that we found, break it into F subcluster, F in this case being 10. Record all the centers of each cluster

In [None]:
bk = [0,0,0,0,0]
f = 10
centers = []
for i in range(5):
    bs = 0
    for k in range(2,10):
        gm = GaussianMixture(k).fit_predict(e[i])
        score = silhouette_score(train[i], gm)
        if score > bs:
            bs = score
            bk[i] = k
    predictor = GaussianMixture(bk[i]).fit(train[i])
    temp = []
    labels = predictor.predict(train[i])
    for cluster_value in np.unique(labels):
        cluster = train[i][labels == cluster_value]
        subclusters = GaussianMixture(f).fit(cluster).means_
        for subsubclusters in subclusters:
            temp.append(subsubclusters)
    centers.append(temp)

Label each image according to which subcluster it belongs to. We add num to make sure that images that belong to different labels dont have the same subcluster number

In [None]:
train_labels = np.array([])
test_labels = np.array([])
num = 0 #just making sure that each group has a different value
for i in range(5):
    train_labels = np.concatenate((train_labels, KMeans(len(centers[i]), init = centers[i]).fit_predict(train[i])  + num))
    test_labels  = np.concatenate(( test_labels, KMeans(len(centers[i]), init = centers[i]).fit_predict(test[i]) + num))
    num += len(centers[i])


Sort the array of subcluster labels in order using the indicies we recorded earlier

In [65]:
train_i = np.array([])
test_i = np.array([])
for i in range(5):
    train_i = np.concatenate((train_i, train_indices[i]))
    test_i  = np.concatenate((test_i, test_indices[i]))


train_idx = np.argsort(train_i)
train_labels = np.array(train_labels)[train_idx]
train_index = np.array(train_i)[train_idx]

test_idx = np.argsort(test_i)
test_labels = np.array(test_labels)[test_idx]
test_index = np.array(test_i)[test_idx]

Save the data

In [66]:
df_train = pd.DataFrame({'Labels': train_labels})
df_test = pd.DataFrame({'Labels' : test_labels})
df_train.to_csv('Saved Data/Groups/training_groups.csv', index= False)
df_test.to_csv('Saved Data/Groups/testing_groups.csv', index = False)