# <center>This `.ipynb` file contains the code for extracting the `VQVAE` latent space and the cluster assignments obtained from the second fine-tuning of the `Classifier`</center>

### 1. Import the required libraries

In [1]:
import torch
from torch import nn

from torchvision import transforms
from torchvision.utils import make_grid

import sys
import os
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tqdm import tqdm

sys.path.insert(0, '..')
from pfiles.unet_cond_base import UNet
from pfiles.vqvae import VQVAE
from pfiles.linear_noise_scheduler import LinearNoiseScheduler

### 2. Define the device

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device is:', device)

Device is: cuda


### 3. Set different hyperparameters

In [3]:
# Seed every random number generator used in this notebook so that runs are repeatable.
seed = 765

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# `device` is a torch.device object; it does not compare equal to the plain
# string 'cuda', so the device type has to be tested instead.
if device.type == 'cuda':
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [4]:
# Hyperparameters shared by the cells below.
# NOTE: `task_name` and the classifier checkpoint file name further down are
# written out for 14 clusters and have to be adapted together with `n_clusters`.
n_clusters = 14  # change it to 10, 11, 12, 13, 15, or 16 for other partitions
z_channels = 16  # input channels of the first convolution of the Classifier
rgb_input = 3  # passed to VQVAE as `im_channels`
select_batch_size = 16  # batch size used when iterating over the dataset

### 4. Load the dataset

In [5]:
# Location of the dataset. The directory defaults to the original cluster path
# and can be overridden through the KPC_DATA_DIR environment variable, so the
# notebook also runs on machines with a different directory layout.
dir_src = os.environ.get('KPC_DATA_DIR', '/project/dsc-is/nono/Documents/kpc/dat0')
data_src = 'slice128_Block2_11K.npy'
data_path = os.path.join(dir_src, data_src)

print(data_path)

# Keep only index 0 along the second axis of the stored array.
kpc_dataset = np.load(data_path)
kpc_dataset = kpc_dataset[:, 0, :, :, :]

print(kpc_dataset.shape)
N_SAMPLE, HEIGHT, WIDTH, CHANNELS = kpc_dataset.shape

/project/dsc-is/nono/Documents/kpc/dat0/slice128_Block2_11K.npy
(11000, 128, 128, 3)


In [6]:
# Cut the sample indices into 11 near-equal consecutive chunks: the last chunk
# is held out as the test set and all remaining indices form the training set.
n_chunks = 11
index_range = np.arange(N_SAMPLE)
split = np.array_split(index_range, n_chunks)
test_dataset = split[n_chunks - 1]
training_dataset = np.setdiff1d(index_range, test_dataset)

In [7]:
# Report how many sample indices ended up in each subset.
n_train, n_test = len(training_dataset), len(test_dataset)
print('Length of the training dataset:', n_train)
print('Length of the test dataset:', n_test)

Length of the training dataset: 10000
Length of the test dataset: 1000


### 5. Custom functions for extracting batches of samples from the dataset

In [8]:
def make_batch_list(idx, n_batch=10, batch_size=None, shuffle=True):
    """Split sample indices into a list of batches.

    Parameters
    ----------
    idx : array-like
        Sample indices to distribute over the batches. It is never modified.
    n_batch : int, default 10
        Number of batches to create; ignored when `batch_size` is given.
    batch_size : int or None, default None
        When given, the number of batches is `len(idx) // batch_size` (but at
        least one), so every batch holds at least `batch_size` indices whenever
        that many are available.
    shuffle : bool, default True
        Shuffle the indices with NumPy's global random state before splitting.

    Returns
    -------
    list of numpy.ndarray
        The batches, as produced by `np.array_split`.
    """
    if shuffle:
        # Shuffle a copy: shuffling `idx` in place would silently reorder the
        # caller's array as a side effect.
        idx = np.random.permutation(idx)
    if batch_size is not None:
        # `np.array_split` rejects a section count of 0, which the plain floor
        # division yields when fewer than `batch_size` indices are available.
        n_batch = max(1, len(idx) // batch_size)
    batch_list = np.array_split(idx, n_batch)
    return batch_list

In [9]:
# Conversion applied to every sample before it is stacked into a batch.
transform = transforms.ToTensor()

def generate_batch(idx, kpc_dataset):
    """Build one batch tensor from the samples of `kpc_dataset` selected by `idx`.

    Each selected sample is passed through `transform`, and the results are
    stacked along a new leading dimension.
    """
    return torch.stack([transform(kpc_dataset[i]) for i in idx], dim=0)

### 6. Set up directory for saving models

In [10]:
# Directory that holds the classifier checkpoint loaded later in this notebook.
task_name = 'models_14'

# Create the directory when it is missing. `exist_ok=True` avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(task_name, exist_ok=True)

### 7. Neural network for deep learning-based clustering

In [11]:
class Classifier(nn.Module):
    """Convolutional head mapping a latent tensor to `n_clusters` output channels.

    The layers are stored in `self.classifier`, an `nn.Sequential` whose
    sub-module names (`conv1`, `bnor1`, `lrel1`, ...) become part of the
    state-dict keys, so they must stay as they are for checkpoints to load.
    """

    def __init__(self):
        super().__init__()

        def down_conv(in_channels):
            # Strided 4x4 convolution with 128 output channels.
            return nn.Conv2d(in_channels=in_channels, out_channels=128, kernel_size=4, stride=2, padding=1)

        def batch_norm():
            return nn.BatchNorm2d(num_features=128, affine=True, track_running_stats=True)

        def leaky_relu():
            return nn.LeakyReLU(negative_slope=0.1, inplace=True)

        # Layers are created in the order in which they are registered.
        named_layers = (
            ('conv1', down_conv(z_channels)),
            ('bnor1', batch_norm()),
            ('lrel1', leaky_relu()),
            ('conv2', down_conv(128)),
            ('bnor2', batch_norm()),
            ('lrel2', leaky_relu()),
            ('conv3', down_conv(128)),
            ('lrel3', leaky_relu()),
            ('conv4', nn.Conv2d(in_channels=128, out_channels=n_clusters, kernel_size=4, stride=1, padding=0)),
            ('lrel4', leaky_relu()),
        )

        self.classifier = nn.Sequential()
        for layer_name, layer in named_layers:
            self.classifier.add_module(layer_name, layer)

    def forward(self, lat):
        """Run `lat` through the sequential stack and return its output."""
        return self.classifier(lat)

### 8. Instantiate the `VQVAE` and `Classifier` architectures

In [12]:
# Build the VQ-VAE, restore its trained weights and switch it to evaluation mode.
vq_vae = VQVAE(im_channels=rgb_input).to(device)
vq_vae.load_state_dict(torch.load(os.path.join('../kpc_ldm', 'vqvae_autoencoder_ckpt.pth'), map_location=device,
                                  weights_only=True))
vq_vae.eval()
# Report success only after the weights have actually been loaded.
print('Loaded vq_vae checkpoint')

Loaded vq_vae checkpoint


<All keys matched successfully>

In [13]:
# Build the classifier, restore the weights of the second fine-tuning and
# switch it to evaluation mode.
# NOTE: the checkpoint file name is written out for the 14-cluster run.
model_cl = Classifier().to(device)
model_cl.load_state_dict(torch.load(os.path.join(task_name, 'classifier_finetuning2_ckpt_20250128_70_14.pth'),
                                    map_location=device, weights_only=True))
model_cl.eval()
# Report success only after the weights have actually been loaded.
print('Loaded model_cl finetuning2 checkpoint')

Loaded model_cl finetuning2 checkpoint


<All keys matched successfully>

### 9. Extract latent space

In [14]:
# Encode every training sample with the VQ-VAE and collect the softmax of the
# encoder output (taken over dimension 1) as a single NumPy array.
vq_vae.eval()

hhh_list = []
batch_list = make_batch_list(training_dataset, batch_size=select_batch_size, shuffle=False)
with torch.inference_mode():
    for idx_tmp in batch_list:
        xxx_tmp = generate_batch(idx_tmp, kpc_dataset)
        im, _ = vq_vae.encode(xxx_tmp.to(device))
        latent_probs = torch.softmax(im, dim=1)
        hhh_list.append(latent_probs.detach().cpu().numpy())

hhh = np.concatenate(hhh_list, axis=0)
print(hhh.shape)

(10000, 16, 32, 32)


In [15]:
# Average `hhh` over its last axis and then over the next-to-last one, which
# leaves a 2-D array.
a = hhh.mean(axis=3).mean(axis=2)
a.shape

(10000, 16)

### 10. Access the cluster space

In [16]:
# Encode every training sample, feed the latent to the classifier and collect
# the per-sample softmax over the `n_clusters` outputs.
vq_vae.eval()
model_cl.eval()

vvv_list = []
batch_list = make_batch_list(training_dataset, batch_size=select_batch_size, shuffle=False)
with torch.inference_mode():
    for idx_tmp in batch_list:
        xxx_tmp = generate_batch(idx_tmp, kpc_dataset)
        im, _ = vq_vae.encode(xxx_tmp.to(device))
        out_cl = model_cl(im)
        # Reshape to rows of length `n_clusters`, then normalise each row.
        cluster_probs = torch.softmax(out_cl.reshape(-1, n_clusters), dim=1)
        vvv_list.append(cluster_probs.detach().cpu().numpy())

vvv_10k = np.concatenate(vvv_list, axis=0)
print(vvv_10k.shape)

(10000, 14)


In [17]:
# For every row of `vvv_10k`: C holds the index of the largest entry and D the
# value found at that index.
best_cluster = np.argmax(vvv_10k, axis=1)
C = list(best_cluster)
D = list(vvv_10k[np.arange(len(vvv_10k)), best_cluster])

In [18]:
# One row per sample: the columns of `a`, followed by the cluster index
# (column '16') and the value stored for that cluster (column '17').
DF = pd.DataFrame(a).assign(**{'16': C, '17': D})
DF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.057595,0.067675,0.058129,0.042794,0.088442,0.053543,0.052616,0.055121,0.045779,0.049125,0.147163,0.070993,0.059063,0.038281,0.057689,0.055993,7,0.816475
1,0.026985,0.044292,0.073882,0.046256,0.029100,0.095295,0.029674,0.058982,0.041400,0.074020,0.216426,0.051991,0.098723,0.030846,0.039377,0.042751,11,0.523989
2,0.046284,0.076820,0.073665,0.052024,0.067470,0.056680,0.052943,0.051458,0.039607,0.045904,0.109663,0.085190,0.079527,0.045343,0.068510,0.048912,5,0.963012
3,0.047946,0.064063,0.072154,0.052653,0.081443,0.052867,0.045848,0.051086,0.042792,0.043281,0.131367,0.086240,0.075695,0.043938,0.062331,0.046297,1,0.996872
4,0.047029,0.076794,0.071956,0.051399,0.069204,0.062240,0.054918,0.051438,0.038797,0.043793,0.099919,0.085383,0.079885,0.046962,0.072355,0.047928,11,0.866706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.044612,0.065441,0.071401,0.051777,0.062607,0.068658,0.049010,0.052594,0.040162,0.048244,0.130337,0.081689,0.079808,0.043182,0.063388,0.047090,10,0.423563
9996,0.056497,0.065437,0.053226,0.039423,0.089269,0.064249,0.051970,0.057516,0.047392,0.046376,0.151992,0.064658,0.059062,0.038185,0.061075,0.053672,2,0.891099
9997,0.054370,0.064864,0.056473,0.041826,0.088380,0.068515,0.051165,0.057697,0.047390,0.044725,0.140883,0.068395,0.061810,0.040132,0.061950,0.051425,4,0.742577
9998,0.056979,0.064283,0.054015,0.038850,0.095660,0.059166,0.050250,0.057395,0.048686,0.047209,0.160208,0.064804,0.056229,0.036626,0.055360,0.054280,12,0.934176


### 11. Save in a `CSV` file

In [19]:
# Write the table without the index column and without a header row.
csv_name = f'HHH{n_clusters}_C{n_clusters}_D{n_clusters}.csv'
DF.to_csv(csv_name, index=False, header=False)