In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN, OPTICS, KMeans
from sklearn import metrics
import hdbscan
import time
import umap
import umap.plot

from utils import prepare_dataset, calculate_all_metric

#### Input

In [3]:
# Load the working table through the project helper `prepare_dataset`
# (imported from `utils`; its internals are not visible in this notebook).
df_hscode4 = prepare_dataset()
# Bare last expression: render the frame as this cell's output.
df_hscode4

Unnamed: 0,hscode4_text,HSCode2
22598,ม้า ลา และล่อมีชีวิต,01
22608,สัตว์จำพวกโคกระบือ มีชีวิต,01
22626,สุกรมีชีวิต,01
22632,แกะและแพะมีชีวิต,01
22640,สัตว์ปีกเลี้ยงมีชีวิต ได้แก่ ไก่ชนิดแกลลัสโดเม...,01
...,...,...
81044,"Original engravings, prints and lithographs. ...",97
81048,"Original sculptures and statuary, in any mater...",97
81086,"Postage or revenue stamps, stamp-postmarks, fi...",97
81088,Collections and collectors' pieces of zoologic...,97


#### Get Vector Input (X)

In [4]:
# Load the pretrained sentence-embedding checkpoint by name.
# NOTE(review): presumably a multilingual model was chosen because the
# displayed `hscode4_text` values mix Thai and English - confirm.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [5]:
# Embed every `hscode4_text` value. `convert_to_tensor=True` asks for a single
# torch tensor as the result (see the `torch.Size` output of the next cell).
X = model.encode(df_hscode4['hscode4_text'].values, convert_to_tensor=True)

In [6]:
# Sanity check on the embedding matrix: one row per encoded text.
X.shape

torch.Size([2427, 512])

In [7]:
# Plain alias (no copy): `find_community` reads the embeddings under this name.
corpus_embeddings = X

#### Calculate Metric

In [34]:
def find_community(threshold, min_community_size=5, X_input=None, random_state=None):
    """Cluster the corpus at one similarity threshold and score the clustering.

    Runs ``util.community_detection`` on the notebook-level
    ``corpus_embeddings``, writes the resulting cluster ids into
    ``df_hscode4['cluster_group_id']`` (rows that belong to no community keep
    ``-1``) and passes a copy of that frame, together with a 2-D projection of
    the embeddings, to ``calculate_all_metric``.

    Parameters
    ----------
    threshold : float
        Forwarded to ``util.community_detection`` as ``threshold``.
    min_community_size : int, default 5
        Forwarded to ``util.community_detection`` as ``min_community_size``.
    X_input : array-like, optional
        Precomputed projection to hand to ``calculate_all_metric``. The
        projection does not depend on ``threshold``, so a caller sweeping many
        thresholds can compute it once and pass it in; every threshold is then
        scored on the same layout. When omitted, a fresh 2-component UMAP is
        fitted on ``corpus_embeddings`` (the original behaviour).
    random_state : int, optional
        Seed forwarded to ``umap.UMAP`` when the projection is computed here.
        ``None`` keeps the original, unseeded behaviour.

    Returns
    -------
    tuple
        ``(dict_result, df_result)`` exactly as returned by
        ``calculate_all_metric``, with ``dict_result['threshold']`` set to
        ``threshold``.

    Notes
    -----
    Side effect: the ``cluster_group_id`` column of the global ``df_hscode4``
    is reset and overwritten on every call.
    """
    clusters = util.community_detection(
        corpus_embeddings,
        min_community_size=min_community_size,
        threshold=threshold,
    )

    # Start from "unassigned" for every row, then label each community's rows.
    df_hscode4['cluster_group_id'] = -1
    cluster_col = df_hscode4.columns.get_loc('cluster_group_id')
    for cluster_id, row_positions in enumerate(clusters):
        # `row_positions` are positions into the embedding matrix, hence iloc.
        df_hscode4.iloc[row_positions, cluster_col] = cluster_id

    if X_input is None:
        # Project the same matrix that was clustered. A torch tensor is turned
        # into a NumPy array explicitly, since implicit conversion fails for
        # tensors that do not live on the CPU.
        embeddings = corpus_embeddings
        if hasattr(embeddings, 'detach'):
            embeddings = embeddings.detach().cpu().numpy()
        X_input = umap.UMAP(
            n_components=2, random_state=random_state
        ).fit_transform(embeddings)

    # A copy is handed over so the metric helper cannot alter the global frame.
    dict_result, df_result = calculate_all_metric(df_hscode4.copy(), X_input)
    dict_result['threshold'] = threshold

    return dict_result, df_result

In [9]:
# Sweep the similarity threshold and collect one metrics dict per value.
result_list = []
# 17 evenly spaced thresholds 0.15, 0.20, ..., 0.95. linspace with an explicit
# count plus rounding is used instead of a fractional np.arange step, which can
# yield drifting values such as 0.30000000000000004 and an endpoint that
# depends on floating-point round-off.
for threshold in np.round(np.linspace(0.15, 0.95, 17), 2):
    # float(...) stores a plain Python float in the result dict / CSV.
    dict_result, df_result = find_community(threshold=float(threshold))
    result_list.append(dict_result)
    print(dict_result)

{'acc_mean': 0.8957830778368101, 'silhouette': -0.1776012, 'cluster_size': 33, 'n_noise': 8, 'n_top_20_cluster': array([1898,  166,   51,   44,   35,   25,   18,   18,   11,   10,    9,
          9,    8,    8,    8,    8,    7,    7,    6,    6]), 'threshold': 0.15}
{'acc_mean': 0.8374593970649103, 'silhouette': -0.28694203, 'cluster_size': 46, 'n_noise': 14, 'n_top_20_cluster': array([1597,  227,  103,   60,   46,   25,   20,   16,   15,   15,   14,
         14,   14,   12,   12,   12,   11,   11,   10,    9]), 'threshold': 0.2}
{'acc_mean': 0.7329950791520203, 'silhouette': -0.34247556, 'cluster_size': 63, 'n_noise': 28, 'n_top_20_cluster': array([1263,  226,  138,  113,   50,   37,   36,   27,   26,   26,   18,
         16,   15,   15,   14,   14,   14,   13,   13,   12]), 'threshold': 0.25}


KeyboardInterrupt: 

In [13]:
# Turn the per-threshold metric dicts collected in `result_list` into one table.
df_result_all = pd.DataFrame(result_list)
# Write it tab-separated, without the index column.
# NOTE(review): the file name is spelled 'commnity' (sic); it is left as-is
# here because other code may read this exact path.
df_result_all.to_csv('commnity_detection_output.csv',index=False, sep='\t')

In [57]:
# Re-run the clustering at one fixed threshold; `df_result` from this call is
# what the later plotting cells use.
dict_result, df_result = find_community(threshold=0.6)
# Bare expression: show the metrics dict for this threshold.
dict_result
# NOTE(review): leftover commented-out call below; `umap_param` is not defined
# in any visible cell.
# X_umap = umap.UMAP(**umap_param).fit_transform(X)

{'acc_mean': 0.328020913156763,
 'silhouette': -0.42472717,
 'cluster_size': 147,
 'n_noise': 1164,
 'n_top_20_cluster': array([42, 31, 27, 26, 26, 25, 21, 19, 19, 18, 18, 18, 18, 18, 17, 16, 16,
        15, 14, 14]),
 'threshold': 0.6}

In [60]:
# Fit a UMAP model with default parameters on the embeddings and keep the
# fitted object, which is what `umap.plot.interactive` is given below.
# NOTE(review): no random_state is set, so the layout may differ between runs.
mapper = umap.UMAP().fit(X)

In [65]:
# Replace the inherited row labels with a fresh 0..n-1 index (the
# `df_result.index.value_counts()` output shows labels occurring twice).
# `drop=True` discards the old labels directly, which also works when the
# index has a name or when the frame already has a column called 'index'.
df_result = df_result.reset_index(drop=True)

In [55]:
# df_result['cluster_group_id'].value_counts()

In [50]:
# umap.plot.points(mapper, labels=df_result['cluster_group_id'].values)

In [67]:
# Build an interactive scatter of the fitted UMAP layout: points are coloured
# by cluster id and every column of `df_result` is offered as hover data.
# NOTE(review): this presumably requires `df_result` rows to be in the same
# order as the rows `mapper` was fitted on - confirm.
p = umap.plot.interactive(
    mapper, 
    labels=df_result['cluster_group_id'].values, 
    hover_data=df_result, 
    point_size=10
)
umap.plot.show(p)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [63]:
# Count how often each row label occurs; a count above 1 means the index
# contains duplicate labels.
# NOTE(review): the execution counter (In [63]) shows this cell ran before the
# reset_index cell (In [65]); on a top-to-bottom re-run the index would already
# be reset and this output would differ.
df_result.index.value_counts()

22598    2
56993    2
59672    2
58823    2
57875    2
        ..
27679    1
27190    1
27181    1
29914    1
32032    1
Length: 1220, dtype: int64