In [1]:
import json
import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch
from sklearn.metrics.cluster import adjusted_rand_score, rand_score, \
adjusted_mutual_info_score, normalized_mutual_info_score, fowlkes_mallows_score, \
silhouette_score, contingency_matrix

In [2]:
# Root folder of the experiment; the other two paths are derived from it.
MAIN_PATH = "2021-09-28 Алексей Лукашин/Models-clustering/"
# Folder that holds the histogram JSON files.
HIST_PATH = f"{MAIN_PATH}histograms_28_10_21_11_06/"
# CSV file with the model labels.
LABEL_PATH = f"{MAIN_PATH}labels.csv"

In [3]:
# Load the label table from the CSV at LABEL_PATH.
label = pd.read_csv(filepath_or_buffer=LABEL_PATH)

In [4]:
# Bare last expression: the notebook renders the label table below the cell.
label

Unnamed: 0,model,type
0,b5025746-e58f-4696-837c-5b21808b7ae9,Torus
1,30a730ca-c466-4e8c-b00b-bf7b317495e2,Cone
2,7d0c7362-1e8d-4008-b444-cdf111e06ce9,Sphere
3,d58d68a4-7acb-48d8-96bd-71f1eca1affe,Sphere
4,df907d43-7c4f-4ccd-9350-0d84c41dba54,Cube
...,...,...
9995,e194b0ba-7565-4919-b108-f288dd73852d,Cone
9996,255e1024-5967-4247-9de9-b595403f6bc5,Cone
9997,4289a105-141e-4857-b1d8-831d933a29a3,Cylinder
9998,cf2364fa-4c63-40cd-8971-682e3790b689,Sphere


In [5]:
# Build one feature record per labelled model:
#   {origin_uuid: {"label": <type>, "<histogram type>_<point index>": <value>, ...}}
data = defaultdict(dict)

# Map model uuid -> type once, so each file costs a dict lookup instead of a scan
# over the whole label frame. drop_duplicates keeps the first row per model, which
# matches the original `.type.values[0]` lookup.
label_by_model = (
    label.drop_duplicates(subset="model").set_index("model")["type"].to_dict()
)

hists_files = [f for f in listdir(HIST_PATH) if isfile(join(HIST_PATH, f))]
for hist_file in tqdm(hists_files):
    with open(join(HIST_PATH, hist_file), encoding="utf-8") as f:
        hist_data = json.load(f)

    orig_uuid = hist_data["origin_uuid"]
    if orig_uuid not in label_by_model:
        # Histogram without a known label: say which one, then skip it.
        print(f"skipping {hist_file}: origin_uuid {orig_uuid} is not present in labels")
        continue

    data[orig_uuid]["label"] = label_by_model[orig_uuid]
    # Flatten every histogram part into columns named "<type>_<index>".
    for hist_part in hist_data["histogram_data"]:
        for hist_part_point_id, hist_part_point in enumerate(hist_part["data"]):
            data[orig_uuid][f"{hist_part['type']}_{hist_part_point_id}"] = hist_part_point

  0%|          | 0/2137 [00:00<?, ?it/s]

In [6]:
# Collapse every label that begins with "Torus" into the single class "Torus".
for record in data.values():
    if record["label"].startswith('Torus'):
        record["label"] = "Torus"

In [7]:
# Turn {uuid: {column: value}} into a frame with one row per model.
# orient="index" builds the rows directly, so the numeric feature columns keep
# numeric dtypes; transposing a frame that mixes the string label with numbers
# would leave every column as dtype=object.
data = pd.DataFrame.from_dict(data, orient="index")

In [8]:
# Bare last expression: the notebook renders the assembled feature frame below the cell.
data

Unnamed: 0,label,model_bounding_sphere_strict_outer_0,model_bounding_sphere_strict_outer_1,model_bounding_sphere_strict_outer_2,model_bounding_sphere_strict_outer_3,model_bounding_sphere_strict_outer_4,model_bounding_sphere_strict_outer_5,model_bounding_sphere_strict_outer_6,model_bounding_sphere_strict_outer_7,model_bounding_sphere_strict_outer_absolute_0,...,hull_bounding_sphere_concentric_sphere_118,hull_bounding_sphere_concentric_sphere_119,hull_bounding_sphere_concentric_sphere_120,hull_bounding_sphere_concentric_sphere_121,hull_bounding_sphere_concentric_sphere_122,hull_bounding_sphere_concentric_sphere_123,hull_bounding_sphere_concentric_sphere_124,hull_bounding_sphere_concentric_sphere_125,hull_bounding_sphere_concentric_sphere_126,hull_bounding_sphere_concentric_sphere_127
11c5e331-a33e-4350-b72b-cdc0df89c1ed,Cylinder,0,0,0,0.000892,0.002574,0.006149,0.008355,0.013177,0,...,0.023442,0.01692,0.013967,0.011527,0.009724,0.008249,0.006607,0.005244,0.003821,0.001848
26cd47e3-e98c-406a-92d7-6ef9ea091131,Cube,0,0,0,0.000661,0.001656,0.002031,0.002843,0.003026,0,...,0.004673,0.004336,0.00411,0.003835,0.003538,0.003306,0.003032,0.002749,0.002543,0.002066
08dec1c1-da98-4c73-87de-74d06c233555,Cylinder,0,0,0,0.000608,0.002269,0.004723,0.006885,0.007792,0,...,0.010711,0.009807,0.009141,0.00844,0.007359,0.006605,0.005677,0.004606,0.003281,0.00135
25879c76-212c-45dd-b224-44fd4f1262ac,Sphere,0,0,0,0.002479,0.006758,0.009277,0.010995,0.011166,0,...,0.008782,0.008149,0.007475,0.009619,0.010958,0.008781,0.007034,0.005715,0.011479,0.002965
06ca7fc2-475c-4a63-bb0c-5d8d8899ecba,Cube,0,0,0,0.000146,0.000772,0.001387,0.002652,0.004087,0,...,0.007241,0.006313,0.005384,0.004612,0.003781,0.003008,0.002329,0.001633,0.000983,0.000327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18bac1bb-d5b7-4fde-9723-43a4a2bdae53,Torus,0,0,0,0.001133,0.004165,0.006252,0.006877,0.007005,0,...,0.006081,0.005978,0.005739,0.0054,0.005277,0.005385,0.005791,0.005501,0.003903,0.001558
0b2018f5-4e91-4649-9d56-78fe1d14f87f,Torus,0,0,0,0.000066,0.001044,0.00173,0.002007,0.001937,0,...,0.004559,0.004739,0.00408,0.003979,0.003672,0.003636,0.003237,0.002517,0.001572,0.000602
094917b7-de47-4741-905b-55784f709637,Cube,0,0,0,0.000139,0.001765,0.003317,0.006113,0.012188,0,...,0.022158,0.018745,0.013827,0.0105,0.00809,0.006263,0.004598,0.003176,0.00178,0.000587
26188779-103e-43e1-8f5b-d78b36d71b26,Torus,0,0,0,0.001056,0.004736,0.006201,0.007577,0.007334,0,...,0.005992,0.005916,0.005801,0.005407,0.005242,0.005208,0.005782,0.005131,0.003663,0.001566


In [9]:
# Rows whose label is missing.
data[data["label"].isnull()]

Unnamed: 0,label,model_bounding_sphere_strict_outer_0,model_bounding_sphere_strict_outer_1,model_bounding_sphere_strict_outer_2,model_bounding_sphere_strict_outer_3,model_bounding_sphere_strict_outer_4,model_bounding_sphere_strict_outer_5,model_bounding_sphere_strict_outer_6,model_bounding_sphere_strict_outer_7,model_bounding_sphere_strict_outer_absolute_0,...,hull_bounding_sphere_concentric_sphere_118,hull_bounding_sphere_concentric_sphere_119,hull_bounding_sphere_concentric_sphere_120,hull_bounding_sphere_concentric_sphere_121,hull_bounding_sphere_concentric_sphere_122,hull_bounding_sphere_concentric_sphere_123,hull_bounding_sphere_concentric_sphere_124,hull_bounding_sphere_concentric_sphere_125,hull_bounding_sphere_concentric_sphere_126,hull_bounding_sphere_concentric_sphere_127


In [10]:
# Distinct label values present in the frame.
label_set = {*data["label"].values}
label_set

{'Cone', 'Cube', 'Cylinder', 'Sphere', 'Torus'}

In [11]:
# Integer code per row: the position of its label among the unique label values.
data_int_labels = np.unique(data["label"].values, return_inverse=True)[-1]

### Clustering evaluation

In [12]:
def metric_to_sign(value):
    """Map a clustering score to a one-glance quality marker.

    Returns:
        "-"   for value < 0.4 (including negative scores),
        "~"   for 0.4 <= value < 0.6,
        "+"   for 0.6 <= value <= 1,
        "NaN" for anything else (NaN, or a value above 1).
    """
    if value < 0.4:
        return "-"
    if value < 0.6:
        return "~"
    # Inclusive upper bound: a perfect score of exactly 1 is a good result.
    if value <= 1:
        return "+"
    # Reached for NaN (every comparison above is False) and for values above 1.
    return "NaN"

def evaluate_clustering(X, predicted, actual):
    """Print quality metrics for one clustering result.

    Args:
        X: feature matrix the clustering was computed on (used by the silhouette score).
        predicted: cluster label per sample produced by the algorithm.
        actual: reference label per sample.

    Prints one line per metric, prefixed with the marker from ``metric_to_sign``.
    Returns None.
    """
    # Metrics that compare the predicted labels with the reference labels.
    agreement_metrics = (
        ("adjusted_rand_score", adjusted_rand_score),
        ("normalized_mutual_info_score", normalized_mutual_info_score),
        ("adjusted_mutual_info_score", adjusted_mutual_info_score),
        ("fowlkes_mallows_score", fowlkes_mallows_score),
    )
    for name, metric in agreement_metrics:
        value = metric(predicted, actual)
        print(f'{metric_to_sign(value)} {name}={value} (perfect is 1, poor is 0)')

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1,
    # e.g. when everything was put into a single cluster. Report that instead of crashing.
    n_found = len(set(predicted))
    if 2 <= n_found <= len(predicted) - 1:
        value = silhouette_score(X, predicted)
        print(f'{metric_to_sign(value)} silhouette_score={value} (perfect is 1, poor is -1)')
    else:
        print(f'NaN silhouette_score=undefined for {n_found} cluster(s) (perfect is 1, poor is -1)')

### KMeans

In [13]:
# One cluster per known shape label (same rule the AgglomerativeClustering cell uses).
# random_state is fixed so the centroid initialisation, and with it the reported
# scores, are reproducible on Restart & Run All.
model = KMeans(n_clusters=len(label_set), random_state=42)

In [14]:
# Fit on the feature columns only; the label column is dropped first.
model.fit(X=data.drop(columns=['label']))

KMeans(n_clusters=5)

In [15]:
# Cluster index for every row, computed on the same feature columns used for fitting.
predictions = model.predict(X=data.drop(columns=['label']))

In [16]:
# Score the KMeans assignment against the integer-encoded labels.
evaluate_clustering(
    X=data.drop(columns=['label']),
    predicted=predictions,
    actual=data_int_labels,
)

- adjusted_rand_score=0.04040850015789842 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.07529506739934061 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.07301637711692109 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.25507653171357936 (perfect is 1, poor is 0)
- silhouette_score=0.22261862556541392 (perfect is 1, poor is -1)


### DBSCAN

In [17]:
# 30 evenly spaced eps candidates, descending from 1 to 0.0001.
eps_range = np.linspace(start=1, stop=0.0001, num=30)

In [18]:
# Sweep the DBSCAN neighbourhood radius and report clusters / noise points per value.
dbscan_features = data.drop(columns=['label'])  # loop-invariant, so computed once
for db_eps in eps_range:
    # eps must be passed explicitly: a bare DBSCAN() ignores the loop variable and
    # runs every iteration with the same default radius.
    db = DBSCAN(eps=db_eps).fit(dbscan_features)
    labels = db.labels_

    # Label -1 is counted as noise below, so it is excluded from the cluster count.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print(f'DBSCAN clusters with eps={db_eps}: n_clusters={n_clusters_} n_noise={n_noise_}')

DBSCAN clusters with eps=1.0: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.9655206896551725: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.9310413793103448: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.8965620689655173: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.8620827586206896: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.8276034482758621: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.7931241379310345: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.7586448275862069: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.7241655172413792: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.6896862068965517: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.6552068965517241: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.6207275862068965: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.586248275862069: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.5517689655172413: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.5172896551724138: n_clusters=1 n_nois

Видно, что DBSCAN не в состоянии разделить данные на кластеры.
Попробуем сделать то же самое на небольшой выборке тех же данных.

In [19]:
# 10% random subsample; random_state is fixed so the same rows are drawn on every run.
data_fraq = data.sample(frac=0.1, random_state=42)

In [20]:
# Same eps sweep as on the full data, run on the subsample.
dbscan_sample_features = data_fraq.drop(columns=['label'])  # loop-invariant, so computed once
for db_eps in eps_range:
    # eps must be passed explicitly: a bare DBSCAN() ignores the loop variable and
    # runs every iteration with the same default radius.
    db = DBSCAN(eps=db_eps).fit(dbscan_sample_features)
    labels = db.labels_

    # Label -1 is counted as noise below, so it is excluded from the cluster count.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print(f'DBSCAN clusters with eps={db_eps}: n_clusters={n_clusters_} n_noise={n_noise_}')

DBSCAN clusters with eps=1.0: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.9655206896551725: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.9310413793103448: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.8965620689655173: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.8620827586206896: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.8276034482758621: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.7931241379310345: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.7586448275862069: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.7241655172413792: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.6896862068965517: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.6552068965517241: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.6207275862068965: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.586248275862069: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.5517689655172413: n_clusters=1 n_noise=0
DBSCAN clusters with eps=0.5172896551724138: n_clusters=1 n_nois

На меньшем объеме данных DBSCAN также не смог выделить кластеры.

### AgglomerativeClustering

In [21]:
# Agglomerative clustering with one cluster per distinct label, fitted on the feature columns.
agglomerative = AgglomerativeClustering(n_clusters=len(label_set))
clustering = agglomerative.fit(data.drop(columns=['label']))

In [22]:
# Distinct cluster ids that were assigned.
{*clustering.labels_}

{0, 1, 2, 3, 4}

In [23]:
# Score the agglomerative assignment against the integer-encoded labels.
evaluate_clustering(
    X=data.drop(columns=['label']),
    predicted=clustering.labels_,
    actual=data_int_labels,
)

- adjusted_rand_score=0.04945980028351741 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.08676326451600873 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.08452087072256753 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.26099516614683077 (perfect is 1, poor is 0)
- silhouette_score=0.20590008580723526 (perfect is 1, poor is -1)


На небольшой выборке:

In [24]:
# 10% random subsample; random_state is fixed so the same rows are drawn on every run.
data_frac = data.sample(frac=0.1, random_state=42)
# Integer code per sampled row: the position of its label among the sample's unique labels.
data_frac_int_labels = np.unique(data_frac.label.values, return_inverse=True)[1]

In [25]:
# Agglomerative clustering on the subsample, still one cluster per distinct label.
agglomerative = AgglomerativeClustering(n_clusters=len(label_set))
clustering = agglomerative.fit(data_frac.drop(columns=['label']))

In [26]:
# Score the subsample clustering against the subsample's integer-encoded labels.
evaluate_clustering(
    X=data_frac.drop(columns=['label']),
    predicted=clustering.labels_,
    actual=data_frac_int_labels,
)

- adjusted_rand_score=0.02654820174505214 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.10058657188772407 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.07481694755927598 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.277276897206451 (perfect is 1, poor is 0)
- silhouette_score=0.23224587248836134 (perfect is 1, poor is -1)


### Birch

In [27]:
# 30 evenly spaced Birch threshold candidates, descending from 1 to 0.0001.
thresholds = np.linspace(start=1, stop=0.0001, num=30)

In [28]:
# Sweep the Birch threshold and show which cluster ids each setting produces.
birch_features = data.drop(columns=['label'])  # loop-invariant, so computed once
for threshold in thresholds:
    # One cluster per known shape label, the same rule the AgglomerativeClustering cell uses.
    predicted = Birch(n_clusters=len(label_set), threshold=threshold).fit_predict(birch_features)
    print(set(predicted))



{0}




{0}
{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}




{0}
{0, 1, 2, 3, 4}
{0, 1, 2, 3, 4}
{0, 1, 2, 3, 4}
{0, 1, 2, 3, 4}
{0, 1, 2, 3, 4}


Видно, что надо использовать очень небольшое значение threshold.

In [29]:
# Finer grid: 30 evenly spaced thresholds, descending from 0.001 to 0.00001.
thresholds = np.linspace(start=0.001, stop=0.00001, num=30)

In [30]:
# Sweep the fine threshold grid; print the cluster ids and the metrics for each setting.
birch_features = data.drop(columns=['label'])  # loop-invariant, so computed once
for threshold in thresholds:
    # One cluster per known shape label, the same rule the AgglomerativeClustering cell uses.
    predicted = Birch(n_clusters=len(label_set), threshold=threshold).fit_predict(birch_features)
    print(set(predicted))
    evaluate_clustering(birch_features, predicted, data_int_labels)

{0, 1, 2, 3, 4}
- adjusted_rand_score=0.03900667558035853 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.07269227728621036 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.0703826549336204 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.26391833673052845 (perfect is 1, poor is 0)
- silhouette_score=0.21891727059563615 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.03900667558035853 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.07269227728621036 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.0703826549336204 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.26391833673052845 (perfect is 1, poor is 0)
- silhouette_score=0.21891727059563615 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.03900667558035853 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.07269227728621036 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.0703826549336204 (perfect is 1, poor is 0)
- fowlkes_mallows_score

- silhouette_score=0.21891727059563615 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.03900667558035853 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.07269227728621036 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.0703826549336204 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.26391833673052845 (perfect is 1, poor is 0)
- silhouette_score=0.21891727059563615 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.03900667558035853 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.07269227728621036 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.0703826549336204 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.26391833673052845 (perfect is 1, poor is 0)
- silhouette_score=0.21891727059563615 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.03900667558035853 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.07269227728621036 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.

На меньшей выборке:

In [31]:
# 10% random subsample; random_state is fixed so the same rows are drawn on every run.
data_frac = data.sample(frac=0.1, random_state=42)
# Integer code per sampled row: the position of its label among the sample's unique labels.
data_frac_int_labels = np.unique(data_frac.label.values, return_inverse=True)[1]

In [32]:
# Same Birch threshold sweep, run on the subsample.
birch_sample_features = data_frac.drop(columns=['label'])  # loop-invariant, so computed once
for threshold in thresholds:
    # One cluster per known shape label, the same rule the AgglomerativeClustering cell uses.
    predicted = Birch(n_clusters=len(label_set), threshold=threshold).fit_predict(birch_sample_features)
    print(set(predicted))
    evaluate_clustering(birch_sample_features, predicted, data_frac_int_labels)

{0, 1, 2, 3, 4}
- adjusted_rand_score=0.054280153648632944 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.10539088605048312 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.08083513897700342 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.2697667573646425 (perfect is 1, poor is 0)
- silhouette_score=0.19731315820147816 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.054280153648632944 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.10539088605048312 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.08083513897700342 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.2697667573646425 (perfect is 1, poor is 0)
- silhouette_score=0.19731315820147816 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.054280153648632944 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.10539088605048312 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.08083513897700342 (perfect is 1, poor is 0)
- fowlkes_mallows_s

{0, 1, 2, 3, 4}
- adjusted_rand_score=0.054280153648632944 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.10539088605048312 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.08083513897700342 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.2697667573646425 (perfect is 1, poor is 0)
- silhouette_score=0.19731315820147816 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.054280153648632944 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.10539088605048312 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.08083513897700342 (perfect is 1, poor is 0)
- fowlkes_mallows_score=0.2697667573646425 (perfect is 1, poor is 0)
- silhouette_score=0.19731315820147816 (perfect is 1, poor is -1)
{0, 1, 2, 3, 4}
- adjusted_rand_score=0.054280153648632944 (perfect is 1, poor is 0)
- normalized_mutual_info_score=0.10539088605048312 (perfect is 1, poor is 0)
- adjusted_mutual_info_score=0.08083513897700342 (perfect is 1, poor is 0)
- fowlkes_mallows_s

Видно, что ни с какими параметрами не удалось получить разделение на правильные кластеры.

### Вывод

Таким образом, можно сказать, что построение диаграмм никаким образом не позволяет разделить фигуры на правильные кластеры. Скорее всего, необходимо использовать алгоритмы машинного обучения с учителем.

In [33]:
# NOTE(review): leftover debugging cell. It prints the kernel's working directory,
# which stores an absolute local path in the saved output; consider removing it before sharing.
!pwd

/Users/psolikov/Nextcloud/HSE/ProdStories/HW4
