# **Embeddings Generation**

In [None]:
# Install the third-party packages used by the cells below.
# stdout and stderr are both redirected to /dev/null, so installation
# failures are silent as well.
!pip install pytorch-lightning > /dev/null 2>&1
!pip install einops > /dev/null 2>&1
!pip install timm > /dev/null 2>&1

In [None]:
# Remove any previous checkout, clone the repository again and install it
# in editable mode.
!rm -rf MixformerFromScratch
!git clone https://github.com/reeWorlds/MixformerFromScratch.git
!pip install -e "MixformerFromScratch"

# Re-run site initialisation; presumably so the freshly installed editable
# package becomes importable without restarting the runtime -- TODO confirm.
import site
site.main()

In [None]:
# Disabled on purpose: the body never runs while the condition is False.
# When switched to True it terminates the Python process immediately via
# os._exit(0). NOTE(review): looks like a manual "restart the runtime"
# switch -- confirm intended use.
if False:
  import os
  os._exit(0)

In [None]:
import torch
import pytorch_lightning as pl
import numpy as np
import os
import gc

import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from einops import rearrange

from Mixformer import st2_ae

In [None]:
# Mount Google Drive in the Colab runtime. `data_prefix` is the directory
# the cells below read the patch files and model checkpoint from and write
# the embedding files to.
from google.colab import drive
drive.mount('/content/drive')
data_prefix = '/content/drive/My Drive/Data/DiplomeGenerated/Stage2'

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that exposes an indexable container unchanged.

    Items are returned exactly as stored; no transform is applied.
    """

    def __init__(self, _data):
        """Keep a reference to ``_data`` (must support ``len`` and indexing)."""
        self._data = _data

    def __len__(self) -> int:
        """Return the number of items in the wrapped container."""
        n_items = len(self._data)
        return n_items

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        item = self._data[idx]
        return item

In [None]:
class LightningMixFormer(pl.LightningModule):
  """Lightning wrapper around `st2_ae.Autoencoder` whose forward pass only encodes."""

  def __init__(self):
    """Build the autoencoder from the configuration returned by `make_ae_config`."""
    super().__init__()
    # The attribute is kept as `model`; presumably the checkpoint's parameter
    # names are prefixed with it -- verify before renaming.
    self.model = st2_ae.Autoencoder(st2_ae.ConfigGeneration.make_ae_config())

  def forward(self, _data):
    """Delegate to the wrapped model's `forward_encoder` and return its result."""
    encoded = self.model.forward_encoder(_data)
    return encoded

In [None]:
# Restore the wrapper from the checkpoint stored under `data_prefix`, switch
# it to evaluation mode and move it to the GPU.
checkpoint_path = os.path.join(data_prefix, f'models/model_ae.ckpt')
model = LightningMixFormer.load_from_checkpoint(checkpoint_path=checkpoint_path)
model.eval()
model = model.to('cuda')

In [None]:
# Indices of the patch files to process: 0..20, i.e. 21 patches in total.
train_patches_nums = list(range(21)) # up to 21

def get_tensor_by_path(file_path, size, shape, dtype):
  """Memory-map a flat binary file and return it as a reshaped tensor.

  file_path: path of the raw binary file.
  size: number of `dtype` elements to map, starting at the beginning of the file.
  shape: target shape passed to `Tensor.reshape`; must hold exactly `size` elements.
  dtype: NumPy dtype of the stored elements.

  The file is opened read-only (mode='r').
  """
  flat_view = np.memmap(file_path, dtype=dtype, mode='r', shape=(size,))
  return torch.from_numpy(flat_view).reshape(*shape)

def get_data_by_num(path_num):
  """Return the float32 tensor of shape (10000, 64, 64, 3) stored in patch `path_num`.

  The data is read from `patch{path_num}_64x64.bin` under `data_prefix`
  through `get_tensor_by_path`.
  """
  shape = (10000, 64, 64, 3)
  n_values = 10000 * 64 * 64 * 3
  file_name = f'patch{path_num}_64x64.bin'
  return get_tensor_by_path(os.path.join(data_prefix, file_name), n_values, shape, np.float32)

device = torch.device('cuda:0')

# Encode every patch with the loaded model and store one embedding tensor
# per patch as `patch{N}_embd.pt` under `data_prefix`.
for patch_num in train_patches_nums:
  d = get_data_by_num(patch_num)
  dataset = MyDataset(d)
  # shuffle=False keeps the embeddings in the same order as the source rows.
  dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False, num_workers=2)
  outputs = []
  # Inference only: without no_grad every forward pass would record an
  # autograd graph that is thrown away immediately, wasting GPU memory.
  with torch.no_grad():
    for batch in dataloader:
      batch = batch.to(device)
      # Move each result to the CPU right away so GPU memory is not
      # accumulated over the whole patch.
      output = model(batch).to('cpu')
      outputs.append(output)
  outputs = torch.cat(outputs, dim=0)
  patch_path = os.path.join(data_prefix, f"patch{patch_num}_embd.pt")
  torch.save(outputs, patch_path)
  print(f"patch={patch_num} has shape {outputs.shape}")

# **Clustering**

In [None]:
import torch
import os
import numpy as np
from sklearn.cluster import KMeans

In [None]:
# Mount Google Drive in the Colab runtime. `data_prefix` is the directory
# the cells below read the embedding files from and write the label files to.
from google.colab import drive
drive.mount('/content/drive')
data_prefix = '/content/drive/My Drive/Data/DiplomeGenerated/Stage2'

In [None]:
train_patches_nums = list(range(21)) # up to 21

# Read every patch's saved embeddings and stack them, in patch order, into
# one NumPy array.
list_d = []
for patch_num in train_patches_nums:
  embd_file = os.path.join(data_prefix, f'patch{patch_num}_embd.pt')
  list_d.append(torch.load(embd_file))
  # Progress message for every fifth patch only.
  if not patch_num % 5:
    print(f"loaded patch={patch_num}")

data = torch.cat(list_d, dim=0).numpy()
print(f"data shape is {data.shape}")

In [None]:
# Seed search: fit k-means once per random state and keep the clustering
# whose smallest cluster is the largest, i.e. the most balanced one by that
# measure.
n_clusters = 20
best_score = 0
best_labels = None
best_rs = None  # seed that produced `best_labels`, so the result can be reproduced
for rs in range(250):
  kmeans = KMeans(n_clusters=n_clusters, n_init=1, random_state=rs)
  kmeans.fit(data)
  t_labels = kmeans.labels_
  # minlength makes a cluster without members count as 0 instead of being
  # dropped from the histogram when it has the highest index.
  t_cluster_counts = np.bincount(t_labels, minlength=n_clusters)
  t_score = t_cluster_counts.min()
  if t_score > best_score:
    best_score = t_score
    best_labels = t_labels
    best_rs = rs
    print(f"New best score {best_score} at rs = {rs}")

In [None]:
# Final clustering with a fixed seed.
# NOTE(review): random_state=146 is presumably the seed reported by the
# search cell above -- confirm.
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, n_init=1, random_state=146).fit(data)
labels = kmeans.labels_

In [None]:
# Number of samples assigned to each cluster label, indexed by label.
cluster_counts = np.bincount(labels)
print(cluster_counts)

In [None]:
# Split the flat label vector back into one file per patch. `data` was built
# by concatenating the patches in `train_patches_nums` order, so the patch at
# position `pos` owns rows [pos * patch_size, (pos + 1) * patch_size).
patch_size = 10000  # rows per patch, as used when the embeddings were generated
labels_pt = torch.tensor(labels).to(dtype=torch.int64)
for pos, patch_num in enumerate(train_patches_nums):
  start = patch_size * pos
  stop = patch_size * (pos + 1)
  # clone() gives the slice its own storage; torch.save serializes the whole
  # storage a view shares, which would write every label into every file.
  sub_data = labels_pt[start:stop].clone()
  sub_data_path = os.path.join(data_prefix, f"patch{patch_num}_labels.pt")
  torch.save(sub_data, sub_data_path)

# **Plot pictures in cluster**

In [None]:
import torch
import math
import os
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive in the Colab runtime. `data_prefix` is the directory
# the cells below read the patch images and their label file from.
from google.colab import drive
drive.mount('/content/drive')
data_prefix = '/content/drive/My Drive/Data/DiplomeGenerated/Stage2'

In [None]:
def get_tensor_by_path(file_path, size, shape, dtype):
  """Memory-map a flat binary file and return it as a reshaped tensor.

  file_path: path of the raw binary file.
  size: number of `dtype` elements to map, starting at the beginning of the file.
  shape: target shape passed to `Tensor.reshape`; must hold exactly `size` elements.
  dtype: NumPy dtype of the stored elements.

  The file is opened read-only (mode='r').
  """
  flat_view = np.memmap(file_path, dtype=dtype, mode='r', shape=(size,))
  return torch.from_numpy(flat_view).reshape(*shape)

def get_data():
  """Load patch 0 and return `(images, labels)`.

  `images` is the float32 tensor of shape (10000, 64, 64, 3) mapped from
  `patch0_64x64.bin`; `labels` is whatever object `torch.load` returns for
  `patch0_labels.pt`. Both files are looked up under `data_prefix`.
  """
  shape = (10000, 64, 64, 3)
  n_values = 10000 * 64 * 64 * 3
  images = get_tensor_by_path(os.path.join(data_prefix, f'patch0_64x64.bin'), n_values, shape, np.float32)
  labels = torch.load(os.path.join(data_prefix, f'patch0_labels.pt'))
  return images, labels

# Load the images and labels of patch 0 once; the cells below read them as globals.
data_images, data_labels = get_data()

# Print both shapes so the two tensors can be checked against each other.
print(f'images.shape = {data_images.shape}')
print(f'labels.shape = {data_labels.shape}')

In [None]:
# Group image indices by cluster label. All 20 keys exist up front so a
# label with no images in this patch still maps to an empty list.
list_idx = {i: [] for i in range(20)}
# Convert the labels to Python ints in one call instead of indexing the
# tensor and calling .item() once per image.
for i, label in enumerate(data_labels[:data_images.shape[0]].tolist()):
  list_idx[label].append(i)

In [None]:
def plot_images(_idxs):
  """Show the images at `_idxs` (indices into `data_images`) on a square grid.

  The grid side is floor(sqrt(len(_idxs))), so indices beyond the largest
  full square are not shown. For an empty `_idxs` nothing is drawn.
  """
  plt.clf()
  n = int(math.sqrt(len(_idxs)))
  if n == 0:
    # plt.subplots(0, 0) would raise; there is nothing to show anyway.
    return
  # Only the first n * n indices fit on the grid; skip converting the rest.
  shown = _idxs[:n * n]
  images = [data_images[index].numpy() for index in shown]
  # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid, where
  # plt.subplots would otherwise return a single Axes object.
  fig, ax = plt.subplots(n, n, figsize=(6, 6), squeeze=False)
  for k, (index, image) in enumerate(zip(shown, images)):
    i, j = divmod(k, n)  # row-major position on the grid
    ax[i, j].imshow(image)
    ax[i, j].set_title(f'Image {index}')
  plt.show()

In [None]:
# Show up to the first 16 images assigned to cluster 5; change the key to
# inspect another cluster.
idxs = list_idx[5][0:16]
plot_images(idxs)