In [1]:
import numpy as np
import torch
torch.multiprocessing.set_sharing_strategy('file_system')
torch.cuda.is_available()
import os
from tqdm import tqdm
from utils.parse import parse_args, load_model, get_pca, get_tsne, get_clusters
from dataset.dataloader import data_loader
import gc
import yaml   


In [2]:
from scipy.spatial.distance import directed_hausdorff
from multiprocessing import Pool,cpu_count
from functools import partial

In [3]:
# Read the experiment configuration. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("config/config_rep_objs.yaml") as config_file:
    # NOTE(review): FullLoader can build more than plain data types; switch to
    # yaml.safe_load if this file could ever come from an untrusted source.
    config = yaml.load(config_file, Loader=yaml.FullLoader)
# Turn the 'train' flag off before the config is handed to data_loader / load_model.
config['training']['train'] = False

In [4]:
# Display the loaded configuration dictionary for reference.
config

{'dataset': {'path': '/mnt/data/das-sb/dataset_hdf5_processed',
  'type': 'abc',
  'num_point': 2048,
  'train_val_percent': 0.85,
  'train_test_split': 0.95},
 'model': {'path': '/home/das-sb/GIT/source_library/log/PointNet/2024-02-22_12-03/sinkhorn/PointNet/checkpoints/self_best_model.pth',
  'name': 'BYOL',
  'proj_dim': 256,
  'tau': 0.01,
  'K': 64,
  'angle': 0.0,
  'md': 'pn',
  'crop': [0.85, 0.85],
  'dims': 1024,
  'neighs': 20},
 'util': {'gpu': 0, 'workers': 8},
 'training': {'train': False,
  'unsup': True,
  'l_type': 'gl',
  'is_recon': True,
  'normal': False,
  'log_dir': 'PointNet',
  'batch_size': 32,
  'epoch': 200,
  'learning_rate': 0.001,
  'lr_decay': 0.5,
  'lr_patience': 5,
  'optimizer': 'AdamW',
  'decay_rate': 0.0001,
  'aug': 'jitter',
  'early_stopping': True,
  'early_stopping_patience': 10,
  'cb': False,
  'return_embedding': True},
 'pca': {'n_components': 512},
 'pca_2': {'n_components': 154},
 'results': {'dir_path': '/mnt/data/das-sb/results'}}

In [5]:

# Run every batch of the loader through the loaded network and collect, on the
# CPU, both the raw point clouds and their embeddings.
test_data_loader = data_loader(config)
with torch.cuda.device(config['util']['gpu']):
    ema_net = load_model(config)
    # NOTE(review): confirm that load_model already puts the network in eval
    # mode; otherwise normalisation/dropout layers would run in training mode.
    emb_all_points = []
    all_points = []
    # Inference only: without no_grad() every forward pass records an autograd
    # graph that is thrown away by the .detach() below, wasting GPU memory.
    with torch.no_grad():
        for batch_id, test_data in tqdm(enumerate(test_data_loader, 0), total=len(test_data_loader), smoothing=0.9,desc = 'train dataset'):
            x1 = test_data[0]
            x1 = x1.cuda()
            # Swap axes 1 and 2 before feeding the network
            # (presumably (batch, points, xyz) -> (batch, xyz, points) — TODO confirm).
            x1 = x1.transpose(2, 1)
            emb =  ema_net(x1,return_embedding=config['training']['return_embedding'])
            # The first element of the model output is what gets stored as the embedding.
            emb_all_points.append(emb[0].detach().cpu().numpy())
            all_points.append(test_data[0].detach().cpu().numpy())
    # Release the loader and any cached GPU blocks before leaving the device scope.
    del test_data_loader
    gc.collect()
    torch.cuda.empty_cache() 
# The network is no longer needed once all embeddings are on the CPU.
del ema_net
gc.collect()
torch.cuda.empty_cache() 
# Stack the per-batch arrays into one array per quantity.
all_points = np.concatenate(all_points,axis=0)
emb_all_points = np.concatenate(emb_all_points,axis=0)
print(f'Dimension of dataset {all_points.shape}')
print(f'Dimension of dataset embedding {emb_all_points.shape}')

Successfully load Abc with 49981 instances


train dataset: 100%|██████████| 1562/1562 [00:41<00:00, 37.54it/s]


Dimension of dataset (49981, 2048, 3)
Dimension of dataset embedding (49981, 1024)


In [6]:
# Reduce the embeddings in two consecutive PCA passes; the component count of
# each pass comes from the 'pca' and 'pca_2' sections of the config.
pca_stage_sizes = (
    config['pca']['n_components'],
    config['pca_2']['n_components'],
)
for stage_components in pca_stage_sizes:
    emb_all_points = get_pca(n_components=stage_components, data=emb_all_points)
emb_all_points.shape

Dimesnion after PCA (49981, 512) and it takes 65.20254111289978 seconds or 1.0867090185483297 minutes or 0.018111816975805495 hours
Dimesnion after PCA (49981, 154) and it takes 20.80364203453064 seconds or 0.34672736724217734 minutes or 0.005778789454036289 hours


(49981, 154)

In [7]:
 from sklearn.cluster import AgglomerativeClustering

In [8]:
# Number of clusters to cut the agglomerative hierarchy into; named so it can be
# tuned in one place instead of being a bare literal in the constructor call.
N_CLUSTERS = 175
clf = AgglomerativeClustering(n_clusters=N_CLUSTERS)

In [9]:
# Fit the clusterer on the reduced embeddings, then read back the label that
# was assigned to every sample.
clf.fit(emb_all_points)
pred_val = clf.labels_

In [10]:
# Sorted, de-duplicated cluster labels as a plain Python list.
unique_labels = np.unique(pred_val)
clusters = unique_labels.tolist()

In [16]:
def calculate_medoid(cluster_points):
    """
    Return the medoid of a cluster of point clouds.

    The medoid is the member whose summed Hausdorff distance to all other
    members is smallest. ``directed_hausdorff(u, v)`` is not symmetric, so the
    pairwise distance used here is the maximum of the two directed distances;
    storing a single directed value in both halves of the matrix would make
    the result depend on the order of the members.

    Parameters
    ----------
    cluster_points : sequence of 2-D arrays
        The members of the cluster; each member is one point cloud that
        ``directed_hausdorff`` accepts (rows are points).

    Returns
    -------
    The element of ``cluster_points`` with the minimum total distance. Ties
    are resolved in favour of the lowest index.

    Raises
    ------
    ValueError
        If ``cluster_points`` is empty.
    """
    num_points = len(cluster_points)
    if num_points == 0:
        raise ValueError("cannot compute the medoid of an empty cluster")

    distances = np.zeros((num_points, num_points))

    # Fill the upper triangle and mirror it; the diagonal stays at zero.
    for i in range(num_points):
        for j in range(i + 1, num_points):
            forward = directed_hausdorff(cluster_points[i], cluster_points[j])[0]
            backward = directed_hausdorff(cluster_points[j], cluster_points[i])[0]
            distances[i, j] = distances[j, i] = max(forward, backward)

    # Total distance from each member to every other member.
    total_distances = np.sum(distances, axis=0)

    # Member with the smallest total distance.
    medoid_index = int(np.argmin(total_distances))

    return cluster_points[medoid_index]


In [17]:
# Split the cluster labels into consecutive slices, one unit of work per slice,
# sized from the number of available CPUs.
processes = cpu_count()
chunk_size = 1 + len(clusters) // processes
chunk_starts = range(0, len(clusters), chunk_size)
chunks = [clusters[start:start + chunk_size] for start in chunk_starts]
chunks

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
 [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],
 [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],
 [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],
 [55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65],
 [66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76],
 [77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],
 [88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],
 [99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109],
 [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120],
 [121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131],
 [132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142],
 [143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153],
 [154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164],
 [165, 166, 167, 168, 169, 170, 171, 172, 173, 174]]

In [18]:
def calc_medoid(clusters,all_points,pred_val):
    """
    Compute the medoid of every cluster label in ``clusters``.

    For each label, the rows of ``all_points`` whose entry in ``pred_val``
    equals that label are gathered and passed to ``calculate_medoid``.

    Returns a dict mapping each label to the value returned by
    ``calculate_medoid`` for that label's members.
    """
    def _members_of(label):
        # Positions along axis 0 at which pred_val carries this label.
        member_idx = np.nonzero(pred_val == label)[0]
        return np.take(all_points, member_idx, axis=0)

    return {label: calculate_medoid(_members_of(label)) for label in clusters}

In [19]:
# Bind the point array and the cluster labels so that the resulting callable
# takes only a list of cluster ids, which is what Pool.map passes to it.
g = partial(calc_medoid,all_points = all_points,pred_val=pred_val)

In [20]:
# Hand each chunk of cluster ids to a worker process; map blocks until all
# chunks are done and returns one result per chunk, in chunk order.
with Pool(processes) as worker_pool:
    res = worker_pool.map(g, chunks)

(0.35608198109532935, 1930, 1564)
<class 'tuple'>
(0.343623500972956, 1972, 1724)
<class 'tuple'>
(0.3803689307808596, 644, 1822)
<class 'tuple'>
(0.09958460394190234, 173, 1364)
<class 'tuple'>
(0.19961621416776737, 293, 264)
<class 'tuple'>
(0.43905721885878596, 1106, 1517)
<class 'tuple'>
(0.42005457899640297, 1943, 1100)
<class 'tuple'>


In [None]:
# Display the list of per-chunk results returned by the pool's map call.
res