In [3]:
# Reload edited modules (e.g. the local `utils` helpers) automatically before
# each cell executes, so changes on disk are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN, OPTICS, KMeans
from sklearn import metrics
import hdbscan
import time
import umap
import umap.plot
import itertools

from utils import prepare_dataset, calculate_all_metric

import logging
# The root logger is capped at WARNING so that third-party libraries stay
# quiet; only this notebook's own logger is opened up to INFO afterwards.
logging.basicConfig(
    level=logging.WARNING,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# Notebook-level logger used by the grid-search helpers for progress messages.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

#### Input

In [5]:
# Load the working dataset through the project helper imported from `utils`.
# The rendered frame below shows the columns `hscode4_text` and `HSCode2`.
df_hscode4 = prepare_dataset()
df_hscode4

Unnamed: 0,hscode4_text,HSCode2
22598,ม้า ลา และล่อมีชีวิต,01
22608,สัตว์จำพวกโคกระบือ มีชีวิต,01
22626,สุกรมีชีวิต,01
22632,แกะและแพะมีชีวิต,01
22640,สัตว์ปีกเลี้ยงมีชีวิต ได้แก่ ไก่ชนิดแกลลัสโดเม...,01
...,...,...
81044,"Original engravings, prints and lithographs. ...",97
81048,"Original sculptures and statuary, in any mater...",97
81086,"Postage or revenue stamps, stamp-postmarks, fi...",97
81088,Collections and collectors' pieces of zoologic...,97


#### Get Vector Input (X)

In [6]:
# Sentence encoder used to embed the description texts.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [7]:
# Embed every description. With convert_to_tensor=True the result is a torch
# tensor (see the torch.Size output of the next cell), not a NumPy array.
X = model.encode(df_hscode4['hscode4_text'].values, convert_to_tensor=True)

In [8]:
# Sanity check: (number of texts, embedding dimension).
X.shape

torch.Size([2427, 512])

In [9]:
# Alias for the embedding tensor; no cell visible in this notebook reads it.
corpus_embeddings = X

#### Calculate Metric

In [36]:
# Search grid per clustering model.
#   'umap'              -> keyword options for umap.UMAP; every option maps to
#                          the list of values to try.
#   'clustering'        -> keyword options for the clusterer class registered
#                          under the same key in `model_classes`.
#   'use_dim_reduction' -> whether UMAP is applied before clustering.
# get_model_result() takes the cartesian product of each sub-grid, so a
# single-element list simply pins that option.
parameters = {
    'HDBSCAN': {
        'umap': {
            'n_neighbors': [5],
            # 'n_components': np.arange(2, 6, 1),
            'n_components': [3],
        },
        'clustering': {
            'alpha': [1.0],
            'min_cluster_size': np.arange(3, 20, 2),
        },
        "use_dim_reduction": True
    },
    'KMean': {
        'umap': {
            # 'n_neighbors': [5],
            # 'n_components': np.arange(2, 6, 1),
            'n_components': [3, 4 , 5],
        },
        'clustering': {
            'n_clusters': [70, 80, 90, 100],
            'random_state': [0],
            # Pin n_init explicitly: scikit-learn's default moves from 10 to
            # 'auto' (FutureWarning in 1.2/1.3, changed in 1.4), which would
            # otherwise alter results depending on the installed version.
            'n_init': [10],
        },
        "use_dim_reduction": True
    },
}

# Registry mapping each grid name used in `parameters` to the clusterer class
# that find_cluster() instantiates with the chosen clustering options.
model_classes = dict(
    HDBSCAN=hdbscan.HDBSCAN,
    KMean=KMeans,
)

# One result bucket per model name; get_model_result() replaces the bucket of
# the model it runs with a fresh list of metric dicts.
model_result = {name: [] for name in ('HDBSCAN', 'KMean')}

def get_model_result(model_name):
    """Run the clustering grid search configured for `model_name`.

    Every combination of the UMAP options is crossed with every combination
    of the clustering options found in ``parameters[model_name]``. Each pair
    is passed to ``find_cluster`` and the metric dict it returns is collected.

    Args:
        model_name: Key into ``parameters`` (and, inside ``find_cluster``,
            into ``model_classes``), e.g. ``'HDBSCAN'`` or ``'KMean'``.

    Returns:
        The list stored at ``model_result[model_name]``: one metric dict per
        (UMAP combination, clustering combination), with the UMAP combination
        varying slowest.

    Raises:
        KeyError: if `model_name` has no entry in ``parameters`` or the entry
            lacks ``'clustering'`` or ``'use_dim_reduction'``.

    Side effects:
        Replaces ``model_result[model_name]`` with a fresh list, discarding
        the results of any previous run for the same model.
    """
    logger.info(f"{model_name}")
    config = parameters[model_name]
    use_dim_reduction = config['use_dim_reduction']

    # 'umap' may be omitted (e.g. when dimensionality reduction is disabled):
    # itertools.product() over an empty grid still yields exactly one (empty)
    # combination, so the clustering grid is evaluated once per setting.
    umap_grid = config.get('umap', {})
    clustering_grid = config['clustering']
    umap_all_combinations = list(itertools.product(*umap_grid.values()))
    # Materialised as a list because it is iterated once per UMAP combination.
    clustering_all_combinations = list(itertools.product(*clustering_grid.values()))

    results = model_result[model_name] = []

    for umap_parameter_value in umap_all_combinations:
        umap_parameter_result = dict(zip(umap_grid, umap_parameter_value))
        logger.info(f'         umap parameter_result {umap_parameter_result}')

        for clustering_parameter_value in clustering_all_combinations:
            clustering_parameter_result = dict(zip(clustering_grid, clustering_parameter_value))
            logger.info(f'          clustering parameter_result {clustering_parameter_result}')

            # Only the metric dict is kept; the second return value of
            # find_cluster is not needed for the summary.
            dict_result, _ = find_cluster(
                umap_parameter_result,
                clustering_parameter_result,
                use_dim_reduction,
                model_name
            )
            results.append(dict_result)

    return results
            
            
    

In [37]:
def _to_numpy(embeddings):
    """Return a torch tensor as a host-side NumPy array; pass anything else through."""
    if hasattr(embeddings, 'detach'):
        return embeddings.detach().cpu().numpy()
    return embeddings


def find_cluster(umap_param, clustering_param, use_dim_reduction, model_name):
    """Cluster the global embeddings `X` once and score the result.

    Args:
        umap_param: Keyword arguments for ``umap.UMAP``; only used when
            `use_dim_reduction` is true. UMAP is stochastic, so include a
            ``'random_state'`` entry here when repeatable results are needed.
        clustering_param: Keyword arguments for the clusterer class.
        use_dim_reduction: When true, cluster the UMAP projection of `X`;
            otherwise cluster the embeddings directly.
        model_name: Key into ``model_classes`` selecting the clusterer.

    Returns:
        ``(dict_result, df_result)`` exactly as returned by
        ``calculate_all_metric``, with ``'umap_param'`` and
        ``'clustering_param'`` added to ``dict_result``.

    Raises:
        KeyError: if `model_name` is not registered in ``model_classes``.

    Side effects:
        Overwrites the ``'cluster_group_id'`` column of the global
        ``df_hscode4`` with the labels of this run.
    """
    # `X` is produced with convert_to_tensor=True, i.e. a torch tensor that may
    # live on the GPU. UMAP and scikit-learn work on NumPy data, so move it to
    # host memory first; this also gives calculate_all_metric the same array
    # type whether or not dimensionality reduction is enabled.
    features = _to_numpy(X)
    if use_dim_reduction:
        features = umap.UMAP(**umap_param).fit_transform(features)

    clusterer = model_classes[model_name](**clustering_param)
    labels = clusterer.fit_predict(features)
    # NOTE(review): calculate_all_metric presumably reads the labels from this
    # column - confirm against utils.
    df_hscode4['cluster_group_id'] = labels

    dict_result, df_result = calculate_all_metric(df_hscode4, features)
    dict_result['umap_param'] = umap_param
    dict_result['clustering_param'] = clustering_param
    logger.info(f'               result {dict_result}')
    return dict_result, df_result

In [43]:
# Run the HDBSCAN grid. `result_list` is also assigned by the KMean cell, so it
# holds whichever of the two runs executed last; the per-model results are
# kept separately in model_result['HDBSCAN'].
result_list = get_model_result('HDBSCAN')

2023-01-23 11:10:06 INFO     HDBSCAN
2023-01-23 11:10:06 INFO              umap parameter_result {'n_neighbors': 5, 'n_components': 3}
2023-01-23 11:10:06 INFO               clustering parameter_result {'alpha': 1.0, 'min_cluster_size': 3}
2023-01-23 11:10:14 INFO                    result {'acc_mean': 0.4894858161568538, 'silhouette': 0.34601808, 'cluster_size': 232, 'n_noise': 457, 'n_top_20_cluster': array([61, 42, 23, 23, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16,
       16, 15, 14]), 'umap_param': {'n_neighbors': 5, 'n_components': 3}, 'clustering_param': {'alpha': 1.0, 'min_cluster_size': 3}}
2023-01-23 11:10:14 INFO               clustering parameter_result {'alpha': 1.0, 'min_cluster_size': 5}
2023-01-23 11:10:21 INFO                    result {'acc_mean': 0.5097491603189267, 'silhouette': 0.28638348, 'cluster_size': 131, 'n_noise': 589, 'n_top_20_cluster': array([80, 55, 47, 46, 41, 31, 29, 29, 28, 26, 25, 24, 23, 23, 23, 23, 23,
       22, 22, 21]), 'umap_param': {'n

In [38]:
# Run the KMean grid. `result_list` is also assigned by the HDBSCAN cell, so it
# holds whichever of the two runs executed last; the per-model results are
# kept separately in model_result['KMean'].
result_list = get_model_result('KMean')

2023-01-23 11:55:56 INFO     KMean
2023-01-23 11:55:56 INFO              umap parameter_result {'n_components': 3}
2023-01-23 11:55:56 INFO               clustering parameter_result {'n_clusters': 70, 'random_state': 0}
2023-01-23 11:56:08 INFO                    result {'acc_mean': 0.623331200211498, 'silhouette': 0.43995208, 'cluster_size': 70, 'n_noise': 0, 'n_top_20_cluster': array([84, 80, 72, 68, 65, 60, 58, 55, 54, 54, 52, 51, 51, 48, 48, 47, 42,
       42, 41, 40]), 'umap_param': {'n_components': 3}, 'clustering_param': {'n_clusters': 70, 'random_state': 0}}
2023-01-23 11:56:08 INFO               clustering parameter_result {'n_clusters': 80, 'random_state': 0}
2023-01-23 11:56:18 INFO                    result {'acc_mean': 0.634987875672429, 'silhouette': 0.41548124, 'cluster_size': 80, 'n_noise': 0, 'n_top_20_cluster': array([66, 63, 61, 52, 52, 49, 47, 47, 46, 46, 42, 41, 40, 40, 40, 39, 39,
       39, 38, 38]), 'umap_param': {'n_components': 3}, 'clustering_param': {'n_clus

In [27]:
# Show the metric dicts of the most recent get_model_result() call.
result_list

[{'acc_mean': 0.5938045472603336,
  'silhouette': 0.4241444,
  'cluster_size': 90,
  'n_noise': 0,
  'n_top_20_cluster': array([75, 62, 61, 52, 52, 46, 41, 39, 39, 38, 37, 36, 35, 35, 35, 35, 35,
         35, 35, 34]),
  'umap_param': {'n_components': 3},
  'clustering_param': {'n_clusters': 90, 'random_state': 0}}]

In [44]:
# result_list = []
# for threshold in np.arange(0.15, 1.0, 0.05):    
#     dict_result, df_result = find_community(threshold=threshold)
#     result_list.append(dict_result)
#     print(dict_result)

In [45]:
# Take the HDBSCAN results from `model_result` (filled by get_model_result)
# rather than from the shared `result_list`, which holds whichever model ran
# last and would put KMean results into this file on a top-to-bottom run.
# NOTE: the file is tab-separated despite its .csv extension.
df_result_all = pd.DataFrame(model_result['HDBSCAN'])
df_result_all.to_csv('HDBSCAN_output.csv',index=False, sep='\t')

In [None]:
# Take the KMean results from `model_result` (filled by get_model_result)
# rather than from the shared `result_list`, which holds whichever model ran
# last and therefore depends on cell execution order.
# NOTE: the file is tab-separated despite its .csv extension.
df_result_all = pd.DataFrame(model_result['KMean'])
df_result_all.to_csv('KMean_output.csv',index=False, sep='\t')

In [None]:
# dict_result, df_result = find_community(threshold=0.5)