In [4]:
import time
import numpy as np
import torch
torch.cuda.is_available()
from tqdm import tqdm
import yaml
from utils.parse import parse_args, load_model, get_clusters, get_pca, get_tsne, get_medoid_indices, calc_dbcv,get_hier_clusters
from dataset.dataloader import data_loader
from DBCV.DBCV_multiproc import DBCV
# from DBCV.DBCV_neighbor import DBCV
# from DBCV.DBCV import DBCV

In [2]:

# Load the run configuration. The file is opened in a context manager so the
# handle is closed as soon as parsing finishes.
# NOTE(review): if the config only holds plain scalars/lists/dicts,
# yaml.safe_load would be sufficient — confirm before switching.
with open("config/config_cal_dbcv.yaml") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
test_data_loader = data_loader(config)
with torch.cuda.device(config['util']['gpu']):
    ema_net = load_model(config)
    all_points = []
    start = time.time()
    # Inference only: disable autograd so no graph is recorded per batch
    # (the embeddings are detached and moved to the CPU right away anyway).
    with torch.no_grad():
        for batch_id, test_data in tqdm(enumerate(test_data_loader, 0), total=len(test_data_loader), smoothing=0.9,desc = 'test dataset'):
            # First element of each batch is the model input; move it to the
            # GPU and swap axes 1 and 2 before the forward pass.
            x1_test = test_data[0]
            x1_test = x1_test.cuda()
            x1_test = x1_test.transpose(2, 1)
            emb =  ema_net(x1_test,return_embedding=config['training']['return_embedding'])
            # Only emb[0] is kept, as a NumPy array on the host.
            all_points.append(emb[0].detach().cpu().numpy())
    # Drop the references to the model and the loader once the embeddings are collected.
    del ema_net
    del test_data_loader
    # Stack the per-batch arrays into one array along axis 0.
    all_points = np.concatenate(all_points,axis=0)
    end = time.time()
    # Seed NumPy's global RNG; the random subsampling in the next cell relies on it.
    np.random.seed(42)
    

Successfully load Abc with 49981 instances


test dataset: 100%|██████████| 391/391 [00:19<00:00, 20.43it/s]


In [3]:
# Keep a random subset of config['dbcv']['num_points'] embeddings, drawn
# without replacement from NumPy's global RNG.
sample_idx = np.random.choice(all_points.shape[0], size=config['dbcv']['num_points'], replace=False)
all_points = all_points[sample_idx]

print(f'Dimension of dataset {all_points.shape} and it takes {end-start} seconds or {(end-start)/60} minutes or {(end-start)/3600} hours')

# Dimensionality reduction: PCA first, then t-SNE on the PCA output.
all_points = get_pca(data=all_points, n_components=config['pca']['n_components'])

# Optional second PCA stage, currently disabled:
# all_points = get_pca(n_components=config['pca_2']['n_components'],data=all_points)

all_points = get_tsne(data=all_points, n_components=config['tsne']['n_components'])

Dimension of dataset (1000, 1024) and it takes 22.516611099243164 seconds or 0.3752768516540527 minutes or 0.006254614194234212 hours
Dimesnion after PCA (1000, 512) and it takes 1.5419604778289795 seconds or 0.02569934129714966 minutes or 0.0004283223549524943 hours
Dimesnion after TSNE (1000, 2) and it takes 2.966329574584961 seconds or 0.049438826243082684 minutes or 0.0008239804373847113 hours


In [6]:
def _rows_matching(rows: np.ndarray, point: np.ndarray) -> np.ndarray:
    """Return the indices of the rows of ``rows`` that equal ``point`` in every coordinate."""
    return np.flatnonzero(np.all(np.asarray(rows) == point, axis=1))


def _owning_cluster_members(point: np.ndarray, clusters: list, cluster_members: list):
    """Return the member array of the first non-noise cluster that contains ``point``, or None."""
    for label, members in zip(clusters, cluster_members):
        if label == -1:
            # The noise group is not a cluster and must never be merged as one.
            continue
        if _rows_matching(members, point).size:
            return members
    return None


def _relabel(all_points: np.ndarray, original_clusters: np.ndarray, members: np.ndarray, new_label) -> None:
    """Write ``new_label`` into ``original_clusters`` for every row of ``all_points`` listed in ``members``."""
    for member in members:
        original_clusters[_rows_matching(all_points, member)] = new_label


def get_hier_clusters(clf, all_points:np.ndarray, original_clusters:np.ndarray, clusters:list, cluster_members:list, rel_points: np.ndarray, min_samples:int = 2, classifier = 'hdbscan',count:int=0):
    """Run one level of the hierarchical merge of clusters via their medoids.

    At level 0 (``count == 0``) nothing is re-clustered: the medoids of the
    supplied ``clf`` become the representative points of the next level.
    At every later level the current representative points are clustered
    again with ``get_clusters`` and the labels in ``original_clusters`` are
    rewritten so that every existing cluster takes the label its
    representative received. A representative that is labelled ``-1`` at this
    level keeps its cluster intact under a fresh label, numbered from
    ``num_clusters`` upwards.

    NOTE: this definition shadows the ``get_hier_clusters`` imported from
    ``utils.parse`` in the import cell.

    Args:
        clf: Fitted clusterer exposing ``medoids_``. Only used when
            ``count == 0``; it is replaced by the result of ``get_clusters``
            otherwise.
        all_points: 2-D array of all points, one row per point.
        original_clusters: Label per row of ``all_points``. Modified in place.
        clusters: Labels of the current clusters, aligned index-for-index
            with ``cluster_members``.
        cluster_members: For each entry of ``clusters``, the rows of
            ``all_points`` carrying that label.
        rel_points: Representative points of the current level.
        min_samples: Forwarded to ``get_clusters``.
        classifier: Forwarded to ``get_clusters``.
        count: Level counter; 0 for the first call.

    Returns:
        Tuple ``(rel_points, original_clusters, clusters, cluster_members,
        num_clusters)``. ``rel_points`` holds the rows selected through
        ``get_medoid_indices``. ``clusters`` and ``cluster_members`` are
        rebuilt from the new labels when ``count != 0`` and passed through
        unchanged otherwise. ``num_clusters`` is the value reported by
        ``get_clusters`` when ``count != 0``; at level 0 it is the number of
        distinct values in ``original_clusters`` (which includes ``-1`` if
        present).
    """
    if count!=0:
        clf, pred, num_clusters = get_clusters(data = rel_points,store_centers = 'medoid', classifier=classifier,min_samples=min_samples)
    all_medoids = clf.medoids_

    print(all_medoids.shape)

    if count!=0:
        cluster_pointer = num_clusters

        # Pass 1: representatives labelled -1 at this level. Their cluster is
        # kept as it is and receives its own new label.
        # Membership is tested on whole rows, and labels are paired with
        # their own member arrays, so every cluster (including the last one)
        # is searched and the noise group is skipped.
        for point, label in zip(rel_points, pred):
            if label != -1:
                continue
            members = _owning_cluster_members(point, clusters, cluster_members)
            if members is not None:
                _relabel(all_points, original_clusters, members, cluster_pointer)
                cluster_pointer += 1

        # Pass 2: every other representative hands its new label to all
        # members of the cluster it came from.
        for point, label in zip(rel_points, pred):
            if label == -1:
                continue
            members = _owning_cluster_members(point, clusters, cluster_members)
            if members is not None:
                _relabel(all_points, original_clusters, members, label)

        # Rebuild the label list and the per-cluster member arrays from the
        # updated labels.
        clusters = np.unique(original_clusters).tolist()
        cluster_members = []
        for cluster in clusters:
            idx = (original_clusters == cluster).nonzero()[0]
            cluster_members.append(np.take(all_points, idx, axis=0))
        print(f'Now number of clusters are {len(np.unique(original_clusters).tolist())}') 
        print(f'The number are outliers in level {count} are {original_clusters.tolist().count(-1)}')

    # The medoids of this level become the representative points of the next.
    all_medoid_indices = get_medoid_indices(medoids=all_medoids,data=rel_points)

    rel_points = np.take(rel_points,all_medoid_indices,axis=0)
    print(f'Dimension of embedding of all representative  objects {rel_points.shape}')

    if count==0:
        num_clusters = len(np.unique(original_clusters).tolist())
    return rel_points, original_clusters, clusters, cluster_members, num_clusters

In [8]:
# Level-0 clustering on a copy of the reduced embeddings.
rel_points = all_points.copy()

clf, pred, num_clusters = get_clusters(
    data=rel_points,
    store_centers='medoid',
    classifier=config['results']['classifier'],
    min_samples=config['results']['min_samples'],
)

# Working copy of the labels, plus the member points of every distinct label.
original_clusters = pred.copy()
clusters = np.unique(original_clusters).tolist()
cluster_members = []
for cluster in clusters:
    idx = np.nonzero(original_clusters == cluster)[0]
    cluster_points = all_points[idx]
    cluster_members.append(cluster_points)

# Merge level by level until a level reports at most 10 clusters.
count = 0
while True:
    rel_points, original_clusters, clusters, cluster_members, num_clusters = get_hier_clusters(
        clf=clf,
        all_points=all_points,
        rel_points=rel_points,
        original_clusters=original_clusters,
        clusters=clusters,
        cluster_members=cluster_members,
        min_samples=config['results']['min_samples'],
        classifier='hdbscan',
        count=count,
    )
    count += 1
    if num_clusters <= 10:
        break

# Final cluster count and per-point labels of the last level.
n_cluster = num_clusters
pred = original_clusters.copy()

hdbscan Clustering
The value of min_cluster_size is 2
Number of outliers 119
Number of clusters excluding outliers 300
Clustering validation dataset took 0.11532783508300781 seconds
(300, 2)


  0%|          | 0/300 [00:00<?, ?it/s]

100%|██████████| 300/300 [00:00<00:00, 21559.00it/s]

Dimension of embedding of all representative  objects (300, 2)
hdbscan Clustering
The value of min_cluster_size is 2
Number of outliers 51
Number of clusters excluding outliers 71
Clustering validation dataset took 0.05086374282836914 seconds
(71, 2)





Now number of clusters are 123
The number are outliers in level 1 are 119


100%|██████████| 71/71 [00:00<00:00, 23185.58it/s]


Dimension of embedding of all representative  objects (71, 2)
hdbscan Clustering
The value of min_cluster_size is 2
Number of outliers 7
Number of clusters excluding outliers 18
Clustering validation dataset took 0.03547978401184082 seconds
(18, 2)
Now number of clusters are 77
The number are outliers in level 2 are 119


100%|██████████| 18/18 [00:00<00:00, 27393.86it/s]


Dimension of embedding of all representative  objects (18, 2)
hdbscan Clustering
The value of min_cluster_size is 2
Number of outliers 0
Number of clusters excluding outliers 4
Clustering validation dataset took 0.025417089462280273 seconds
(4, 2)
Now number of clusters are 63
The number are outliers in level 3 are 119


100%|██████████| 4/4 [00:00<00:00, 25458.60it/s]

Dimension of embedding of all representative  objects (4, 2)



