In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN, OPTICS, KMeans
from sklearn import metrics
import hdbscan
import time
import umap
import umap.plot
import itertools

from utils import prepare_dataset, calculate_all_metric

import logging
# The root logger is configured at WARNING so that third-party libraries stay
# quiet; only this notebook's own logger is raised to INFO further down.
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.WARNING,  # keep third-party log output at WARNING so it is hidden
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # progress messages written by this notebook use INFO

#### Input

In [3]:
# prepare_dataset() comes from the local `utils` module. The displayed frame has
# a text column 'hscode4_text' and a label column 'HSCode2'.
df_hscode4 = prepare_dataset()
df_hscode4

Unnamed: 0,hscode4_text,HSCode2
0,ม้า ลา และล่อมีชีวิต,01
1,สัตว์จำพวกโคกระบือ มีชีวิต,01
2,สุกรมีชีวิต,01
3,แกะและแพะมีชีวิต,01
4,สัตว์ปีกเลี้ยงมีชีวิต ได้แก่ ไก่ชนิดแกลลัสโดเม...,01
...,...,...
2422,"Original engravings, prints and lithographs. ...",97
2423,"Original sculptures and statuary, in any mater...",97
2424,"Postage or revenue stamps, stamp-postmarks, fi...",97
2425,Collections and collectors' pieces of zoologic...,97


In [5]:
df_hscode4['HSCode2'].value_counts()

84    170
28     93
85     92
29     83
90     64
     ... 
66      6
24      6
46      4
14      4
13      4
Name: HSCode2, Length: 96, dtype: int64

In [6]:
df_hscode4[df_hscode4['HSCode2'] == '84'] 

Unnamed: 0,hscode4_text,HSCode2
954,เครื่องปฎิกรณ์นิวเคลียร์ แท่งเชื้อเพลิง (คาร์ท...,84
955,บอยเลอร์กำเนิดไอน้ำหรือไออื่น ๆ (นอกจากบอยเลอร...,84
956,บอยเลอร์สำหรับการทำความร้อนจากส่วนกลาง นอกจาก...,84
957,เครื่องจักรโรงงานที่เป็นเครื่องช่วยสำหรับใช้งา...,84
958,เครื่องกำเนิดโพรดิวเซอร์ก๊าซหรือวอเตอร์ก๊าซ มี...,84
...,...,...
2242,Ball or roller bearings.,84
2243,Transmission shafts (including cam shafts and ...,84
2244,Gaskets and similar joints of metal sheeting c...,84
2245,Machines and apparatus of a kind used solely o...,84


#### Compute sentence embeddings (X)

In [4]:
# Multilingual sentence-embedding model; the descriptions shown above are in both Thai and English.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [5]:
# convert_to_tensor=True makes encode() return a single torch tensor
# (the next cell shows its shape as a torch.Size).
X = model.encode(df_hscode4['hscode4_text'].values, convert_to_tensor=True)

In [6]:
X.shape

torch.Size([2427, 512])

In [7]:
# Alias of X (same object, no copy); find_community() reads the embeddings under this name.
corpus_embeddings = X

#### Grid search: cluster and calculate metrics

In [47]:
# Hyper-parameter grids, keyed by the same model names as `model_classes`.
# For every model:
#   'umap'              -> keyword arguments for umap.UMAP, each mapped to a list of candidates
#   'clustering'        -> keyword arguments for the clusterer class, each mapped to a list of candidates
#   'use_dim_reduction' -> whether find_cluster() runs UMAP before clustering
# get_model_result() evaluates every combination of the candidate lists.
parameters = {
    'HDBSCAN': {
        'umap': {
            # alternative grids, currently disabled:
            # 'n_neighbors': [5], 
            # 'n_components': np.arange(2, 6, 1),
            'n_components': [3],
        },
        'clustering': {
            'alpha': [1.0], 
            'min_cluster_size': [5],
            # alternative grid, currently disabled:
            # 'min_cluster_size': np.arange(3, 20, 2),
            # NOTE(review): the two keys below look like leftovers from a different
            # estimator -- confirm HDBSCAN accepts them before enabling.
            # 'n_estimators': np.linspace(1, 100, num=3, dtype='int'),
            # 'contamination': np.linspace(0.01, 0.5, num=10),  
        },
        "use_dim_reduction": True
    },
    'KMean': {
        'umap': {
            # alternative grids, currently disabled:
            # 'n_neighbors': [5], 
            # # 'n_components': np.arange(2, 6, 1),
            'n_components': [3],
        },
        'clustering': {
            'n_clusters': [90],
            'random_state': [0],
        },
        "use_dim_reduction": True
        # other KMeans settings that were tried, currently disabled:
        # 'n_init': "auto",
        # n_clusters=6, random_state=0, n_init="auto"
    },
    'DBSCAN': {
        'umap': {
            'n_components': [3],
        },
        'clustering': {
            'eps': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 
            'min_samples': [2],
        },
        "use_dim_reduction": True
    },
    
    
}

# Model name -> clusterer class; find_cluster() instantiates the class with one
# combination from the 'clustering' grid.
model_classes = {
    'HDBSCAN': hdbscan.HDBSCAN,
    'KMean': KMeans,
    'DBSCAN': DBSCAN
}

# Model name -> list of metric dicts; get_model_result() replaces the list for
# the model it is run on.
model_result = {
    'HDBSCAN': [],
    'KMean': [],
    'DBSCAN': [],
}

def _expand_grid(grid):
    """Return one ``{name: value}`` dict per combination of the candidate lists in ``grid``."""
    names = list(grid)
    return [dict(zip(names, combo)) for combo in itertools.product(*grid.values())]


def get_model_result(model_name):
    """Evaluate the whole parameter grid of one clustering model.

    Every combination of the model's 'umap' candidates is paired with every
    combination of its 'clustering' candidates (both read from the
    module-level ``parameters`` dict) and passed to ``find_cluster``.

    Args:
        model_name: Key into ``parameters`` (and, through ``find_cluster``,
            into ``model_classes``), e.g. ``'HDBSCAN'``.

    Returns:
        A tuple ``(results, df_result)``. ``results`` is the list of metric
        dicts returned by ``find_cluster``, one per combination in evaluation
        order; the same list is stored in ``model_result[model_name]``,
        replacing any earlier run. ``df_result`` is the frame returned for the
        LAST combination only, or ``None`` when the grid has no combinations
        (for example because one candidate list is empty).

    Raises:
        KeyError: If ``model_name`` has no entry in ``parameters``.
    """
    logger.info(f"{model_name}")
    model_config = parameters[model_name]

    umap_grid = _expand_grid(model_config['umap'])
    clustering_grid = _expand_grid(model_config['clustering'])

    results = model_result[model_name] = []
    # Stays None when no combination is evaluated, instead of being unbound at
    # the return statement.
    df_result = None

    for umap_kwargs in umap_grid:
        logger.info(f'         umap parameter_result {umap_kwargs}')

        for clustering_kwargs in clustering_grid:
            logger.info(f'          clustering parameter_result {clustering_kwargs}')

            # Hand over fresh dicts: find_cluster stores them inside its result,
            # so separate runs must not share one object.
            dict_result, df_result = find_cluster(
                dict(umap_kwargs),
                dict(clustering_kwargs),
                model_config['use_dim_reduction'],
                model_name,
            )
            results.append(dict_result)

    return results, df_result
            
            
    

In [22]:
def find_cluster(umap_param, clustering_param, use_dim_reduction, model_name):
    """Cluster the embeddings with one parameter combination and score the result.

    Args:
        umap_param: Keyword arguments for ``umap.UMAP``; only used when
            ``use_dim_reduction`` is true. UMAP is stochastic, so include
            ``'random_state'`` here when repeatable projections are needed.
        clustering_param: Keyword arguments for the clusterer class.
        use_dim_reduction: If true, cluster the UMAP projection of the global
            ``X``; otherwise cluster ``X`` directly.
        model_name: Key into ``model_classes`` selecting the clusterer class.

    Returns:
        ``(dict_result, df_result)`` as returned by ``calculate_all_metric``,
        with ``'umap_param'`` and ``'clustering_param'`` added to the dict.

    Side effects:
        Overwrites the ``'cluster_group_id'`` column of the global
        ``df_hscode4`` with the labels of this run.
    """
    if use_dim_reduction:
        X_input = umap.UMAP(**umap_param).fit_transform(X)
    else:
        X_input = X

    clusterer = model_classes[model_name](**clustering_param)
    cluster_labels = clusterer.fit_predict(X_input)
    df_hscode4['cluster_group_id'] = cluster_labels

    # Score a snapshot, as find_community() does. calculate_all_metric lives in
    # `utils`, so whether it returns the frame it was given is not visible here;
    # passing a copy guarantees the returned frame cannot alias df_hscode4,
    # whose label column is overwritten by every later run.
    dict_result, df_result = calculate_all_metric(df_hscode4.copy(), X_input)
    dict_result['umap_param'] = umap_param
    dict_result['clustering_param'] = clustering_param
    logger.info(f'               result {dict_result}')
    return dict_result, df_result

In [30]:
# Run the HDBSCAN grid. `df_result` is the frame of the last combination only;
# both names are reused by the other model runs in this notebook.
result_list, df_result  = get_model_result('HDBSCAN')

2023-01-23 14:41:26 INFO     HDBSCAN
2023-01-23 14:41:26 INFO              umap parameter_result {'n_components': 3}
2023-01-23 14:41:26 INFO               clustering parameter_result {'alpha': 1.0, 'min_cluster_size': 5}
2023-01-23 14:41:35 INFO                    result {'acc_mean': 0.5141121437568484, 'silhouette': 0.15757981, 'cluster_size': 98, 'n_noise': 752, 'n_top_20_cluster': array([137,  82,  77,  58,  54,  45,  37,  35,  29,  29,  29,  28,  23,
        23,  23,  22,  21,  20,  20,  20]), 'umap_param': {'n_components': 3}, 'clustering_param': {'alpha': 1.0, 'min_cluster_size': 5}}


In [13]:
# Run the KMean grid. This rebinds `result_list` and `df_result`, so anything
# kept from an earlier model run is no longer reachable under these names.
result_list, df_result = get_model_result('KMean')

2023-01-23 14:29:36 INFO     KMean
2023-01-23 14:29:36 INFO              umap parameter_result {'n_components': 3}
2023-01-23 14:29:36 INFO               clustering parameter_result {'n_clusters': 90, 'random_state': 0}
2023-01-23 14:29:44 INFO                    result {'acc_mean': 0.6218484942582964, 'silhouette': 0.43212065, 'cluster_size': 90, 'n_noise': 0, 'n_top_20_cluster': array([74, 73, 67, 63, 62, 59, 45, 44, 41, 40, 38, 38, 37, 37, 36, 35, 34,
       33, 33, 33]), 'umap_param': {'n_components': 3}, 'clustering_param': {'n_clusters': 90, 'random_state': 0}}


In [49]:
# result_list, df_result = get_model_result('DBSCAN')

In [15]:
# Fit a separate UMAP with default settings for the plots below. It is
# independent of the UMAP projections fitted inside find_cluster().
mapper = umap.UMAP().fit(X)

In [18]:
# umap.plot.points(mapper, labels=df_result['cluster_group_id'].values)

In [31]:
# Interactive plot of the `mapper` embedding, labelled by predicted cluster id.
# `df_result` is whatever the most recently executed clustering cell returned.
p = umap.plot.interactive(
    mapper, 
    labels=df_result['cluster_group_id'].values, 
    hover_data=df_result, 
    point_size=10
)
umap.plot.show(p)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
# Same plot as the cluster view, but labelled by the 'HSCode2' column instead
# of the predicted cluster id, for visual comparison.
p = umap.plot.interactive(
    mapper, 
    labels=df_result['HSCode2'].values, 
    hover_data=df_result, 
    point_size=10
)
umap.plot.show(p)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
df_result

Unnamed: 0,hscode4_text,HSCode2,cluster_group_id
0,ม้า ลา และล่อมีชีวิต,01,33
1,สัตว์จำพวกโคกระบือ มีชีวิต,01,5
2,สุกรมีชีวิต,01,5
3,แกะและแพะมีชีวิต,01,33
4,สัตว์ปีกเลี้ยงมีชีวิต ได้แก่ ไก่ชนิดแกลลัสโดเม...,01,12
...,...,...,...
2422,"Original engravings, prints and lithographs. ...",97,33
2423,"Original sculptures and statuary, in any mater...",97,2
2424,"Postage or revenue stamps, stamp-postmarks, fi...",97,46
2425,Collections and collectors' pieces of zoologic...,97,71


In [27]:
result_list

[{'acc_mean': 0.5938045472603336,
  'silhouette': 0.4241444,
  'cluster_size': 90,
  'n_noise': 0,
  'n_top_20_cluster': array([75, 62, 61, 52, 52, 46, 41, 39, 39, 38, 37, 36, 35, 35, 35, 35, 35,
         35, 35, 34]),
  'umap_param': {'n_components': 3},
  'clustering_param': {'n_clusters': 90, 'random_state': 0}}]

In [44]:
# Disabled threshold sweep for community detection.
# NOTE(review): find_community() is defined in a later cell of this notebook,
# so that cell has to be executed before this sweep is re-enabled.
# result_list = []
# for threshold in np.arange(0.15, 1.0, 0.05):    
#     dict_result, df_result = find_community(threshold=threshold)
#     result_list.append(dict_result)
#     print(dict_result)

In [45]:
# NOTE(review): `result_list` holds the output of whichever get_model_result()
# call ran last, so this file only contains HDBSCAN metrics if the HDBSCAN cell
# was the most recent one -- confirm before running.
# The file is tab-separated (sep='\t') despite its .csv extension.
df_result_all = pd.DataFrame(result_list)
df_result_all.to_csv('HDBSCAN_output.csv',index=False, sep='\t')

In [None]:
# NOTE(review): `result_list` holds the output of whichever get_model_result()
# call ran last, so this file only contains KMean metrics if the KMean cell was
# the most recent one -- confirm before running.
# The file is tab-separated (sep='\t') despite its .csv extension.
df_result_all = pd.DataFrame(result_list)
df_result_all.to_csv('KMean_output.csv',index=False, sep='\t')

In [None]:
# Repeat of the cluster-id plot for the current `df_result`; what it shows
# depends on which clustering cell was executed last.
p = umap.plot.interactive(
    mapper, 
    labels=df_result['cluster_group_id'].values, 
    hover_data=df_result, 
    point_size=10
)
umap.plot.show(p)

In [32]:
def find_community(threshold, min_community_size=5, n_components=2):
    """Label rows by community detection on the embeddings and score the result.

    Args:
        threshold: Passed to ``util.community_detection`` as ``threshold``.
        min_community_size: Passed to ``util.community_detection`` as
            ``min_community_size``. Defaults to 5, the value that used to be
            hard-coded.
        n_components: Number of UMAP components for the projection handed to
            ``calculate_all_metric``. Defaults to 2, the value that used to be
            hard-coded.

    Returns:
        ``(dict_result, df_result)`` as returned by ``calculate_all_metric``,
        with ``'threshold'`` added to the dict.

    Side effects:
        Overwrites the ``'cluster_group_id'`` column of the global
        ``df_hscode4``; rows that belong to no community are set to -1.
    """
    clusters = util.community_detection(
        corpus_embeddings,
        min_community_size=min_community_size,
        threshold=threshold,
    )

    # Start with every row unassigned (-1), then stamp each community's rows.
    df_hscode4['cluster_group_id'] = -1
    label_col = df_hscode4.columns.get_loc('cluster_group_id')
    for community_id, member_rows in enumerate(clusters):
        # Each community is used as a list of positional row indices.
        df_hscode4.iloc[member_rows, label_col] = community_id

    X_input = umap.UMAP(n_components=n_components).fit_transform(X)
    # Score a copy so the returned frame is a snapshot of this run.
    dict_result, df_result = calculate_all_metric(df_hscode4.copy(), X_input)
    dict_result['threshold'] = threshold

    return dict_result, df_result

In [33]:
# Single community-detection run; this rebinds `df_result`, which the plot
# cell below then displays.
dict_result, df_result = find_community(threshold=0.6)

In [36]:
df_result

Unnamed: 0,hscode4_text,HSCode2,cluster_group_id
0,ม้า ลา และล่อมีชีวิต,01,-1
1,สัตว์จำพวกโคกระบือ มีชีวิต,01,71
2,สุกรมีชีวิต,01,71
3,แกะและแพะมีชีวิต,01,71
4,สัตว์ปีกเลี้ยงมีชีวิต ได้แก่ ไก่ชนิดแกลลัสโดเม...,01,-1
...,...,...,...
2422,"Original engravings, prints and lithographs. ...",97,-1
2423,"Original sculptures and statuary, in any mater...",97,-1
2424,"Postage or revenue stamps, stamp-postmarks, fi...",97,-1
2425,Collections and collectors' pieces of zoologic...,97,-1


In [34]:
# Plot the community-detection labels on the `mapper` embedding; in the table
# displayed above, -1 is the label of rows assigned to no community.
p = umap.plot.interactive(
    mapper, 
    labels=df_result['cluster_group_id'].values, 
    hover_data=df_result, 
    point_size=10
)
umap.plot.show(p)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
