In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
from sklearn import metrics
import torch
import time
import umap
import umap.plot
import time
import itertools
from line_profiler import LineProfiler

from utils import prepare_dataset, calculate_all_metric, find_spearman_pearson

import logging
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.WARNING,  # set 3rd party logs to warning (for hiding it)
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

profile = LineProfiler()

#### Input

In [3]:
df_hscode4 = prepare_dataset()
df_hscode4

Unnamed: 0,hscode4_text,HSCode2
0,ม้า ลา และล่อมีชีวิต,01
1,สัตว์จำพวกโคกระบือ มีชีวิต,01
2,สุกรมีชีวิต,01
3,แกะและแพะมีชีวิต,01
4,สัตว์ปีกเลี้ยงมีชีวิต ได้แก่ ไก่ชนิดแกลลัสโดเม...,01
...,...,...
2422,"Original engravings, prints and lithographs. ...",97
2423,"Original sculptures and statuary, in any mater...",97
2424,"Postage or revenue stamps, stamp-postmarks, fi...",97
2425,Collections and collectors' pieces of zoologic...,97


In [4]:
df_cross = df_hscode4.merge(df_hscode4, how='cross')
df_cross.loc[ df_cross['HSCode2_x'] == df_cross['HSCode2_y'], 'score'] = 1
df_cross.loc[ df_cross['HSCode2_x'] != df_cross['HSCode2_y'], 'score'] = 0
df_cross

Unnamed: 0,hscode4_text_x,HSCode2_x,hscode4_text_y,HSCode2_y,score
0,ม้า ลา และล่อมีชีวิต,01,ม้า ลา และล่อมีชีวิต,01,1.0
1,ม้า ลา และล่อมีชีวิต,01,สัตว์จำพวกโคกระบือ มีชีวิต,01,1.0
2,ม้า ลา และล่อมีชีวิต,01,สุกรมีชีวิต,01,1.0
3,ม้า ลา และล่อมีชีวิต,01,แกะและแพะมีชีวิต,01,1.0
4,ม้า ลา และล่อมีชีวิต,01,สัตว์ปีกเลี้ยงมีชีวิต ได้แก่ ไก่ชนิดแกลลัสโดเม...,01,1.0
...,...,...,...,...,...
5890324,Antiques of an age exceeding one hundred years...,97,"Original engravings, prints and lithographs. ...",97,1.0
5890325,Antiques of an age exceeding one hundred years...,97,"Original sculptures and statuary, in any mater...",97,1.0
5890326,Antiques of an age exceeding one hundred years...,97,"Postage or revenue stamps, stamp-postmarks, fi...",97,1.0
5890327,Antiques of an age exceeding one hundred years...,97,Collections and collectors' pieces of zoologic...,97,1.0


In [5]:
T = df_hscode4['hscode4_text'].values
Y = df_hscode4['HSCode2'].values
labels = df_cross['score'].values

In [6]:
labels.shape

(5890329,)

In [66]:
# model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [67]:
# X = model.encode(T, convert_to_tensor=True)

In [68]:
# X.shape

In [65]:
# cos_sim = util.cos_sim(X, X)
# dot_products = util.dot_score(X,X)

# cosine_scores = torch.reshape(cos_sim, (labels.shape[0], ))
# dot_products = torch.reshape(dot_products, (labels.shape[0], ))

In [7]:
parameters = {
    'distiluse-base-multilingual-cased-v2': {
    },
    'paraphrase-multilingual-MiniLM-L12-v2': {
    },
    'paraphrase-multilingual-mpnet-base-v2': {
    },
    
    
}

model_result = {
    'distiluse-base-multilingual-cased-v2': {},
    'paraphrase-multilingual-MiniLM-L12-v2': {},
    'paraphrase-multilingual-mpnet-base-v2': {},
}

In [13]:
# @profile
def run_model(model_name):
    start_time = time.time()
    model = SentenceTransformer(model_name)
    load_model_time = time.time() - start_time
    print('   load_model_time:', load_model_time)
    
    start_time = time.time()
    X = model.encode(T, convert_to_tensor=True)
    encode_time = time.time() - start_time
    print('   encode_time:', load_model_time)
    
    start_time = time.time()
    result = find_spearman_pearson(X, labels)
    spearman_pearson_time = time.time() - start_time
    
    result['load_model_time'] = load_model_time
    result['encode_time'] = encode_time
    result['spearman_pearson_time'] = spearman_pearson_time
    
    # with open(f'profiling/profiling_{model_name}.txt', 'w') as stream:
    #     profile.print_stats(stream=stream)
    
    return result 
    

In [14]:
for key, value in parameters.items():
    print(f'--- {key} ---')
    result = run_model(key)   
    model_result[key] = result 
    print(result)
        


--- distiluse-base-multilingual-cased-v2 ---
   load_model_time: 9.80741024017334
   encode_time: 9.80741024017334
{'spearman_cosine': 0.12185646569416776, 'pearson_cosine': 0.18877965377804343, 'spearman_dot': 0.1135390022756929, 'pearson_dot': 0.18198868809745544, 'load_model_time': 9.80741024017334, 'encode_time': 147.58377933502197, 'spearman_pearson_time': 7.422705173492432}
--- paraphrase-multilingual-MiniLM-L12-v2 ---
   load_model_time: 1.8617308139801025
   encode_time: 1.8617308139801025
{'spearman_cosine': 0.16805728795253272, 'pearson_cosine': 0.23396830402449054, 'spearman_dot': 0.1567375596275878, 'pearson_dot': 0.22078572014800174, 'load_model_time': 1.8617308139801025, 'encode_time': 66.21149754524231, 'spearman_pearson_time': 3.884265184402466}
--- paraphrase-multilingual-mpnet-base-v2 ---
   load_model_time: 6.890433073043823
   encode_time: 6.890433073043823
{'spearman_cosine': 0.17721252569385856, 'pearson_cosine': 0.2390359997085682, 'spearman_dot': 0.1691917671565

In [15]:
model_result_temp = model_result.copy()

In [16]:
model_result_temp

{'distiluse-base-multilingual-cased-v2': {'spearman_cosine': 0.12185646569416776,
  'pearson_cosine': 0.18877965377804343,
  'spearman_dot': 0.1135390022756929,
  'pearson_dot': 0.18198868809745544,
  'load_model_time': 9.80741024017334,
  'encode_time': 147.58377933502197,
  'spearman_pearson_time': 7.422705173492432},
 'paraphrase-multilingual-MiniLM-L12-v2': {'spearman_cosine': 0.16805728795253272,
  'pearson_cosine': 0.23396830402449054,
  'spearman_dot': 0.1567375596275878,
  'pearson_dot': 0.22078572014800174,
  'load_model_time': 1.8617308139801025,
  'encode_time': 66.21149754524231,
  'spearman_pearson_time': 3.884265184402466},
 'paraphrase-multilingual-mpnet-base-v2': {'spearman_cosine': 0.17721252569385856,
  'pearson_cosine': 0.2390359997085682,
  'spearman_dot': 0.16919176715652487,
  'pearson_dot': 0.23484528793003517,
  'load_model_time': 6.890433073043823,
  'encode_time': 207.694682598114,
  'spearman_pearson_time': 4.681907653808594}}

In [17]:
result_list = []
for key, value in model_result_temp.items():
    model_result_temp[key]['model_name'] = key
    result_list.append(model_result_temp[key])
    
df_result_all = pd.DataFrame(result_list)
df_result_all.to_csv('spearman_pearson_output.csv',index=False, sep='\t')

In [92]:
model_result

{'distiluse-base-multilingual-cased-v2': {'model_name': 'distiluse-base-multilingual-cased-v2'},
 'paraphrase-multilingual-MiniLM-L12-v2': {'model_name': 'paraphrase-multilingual-MiniLM-L12-v2'},
 'paraphrase-multilingual-mpnet-base-v2': {'model_name': 'paraphrase-multilingual-mpnet-base-v2'}}

In [85]:
# del model_result['model_name']

#### Umap

In [26]:
umap_params = {
    'n_components': np.arange(2, 11, 1),
    'n_neighbors': [2,4,8,16,32,64],
}

parameters = {
    'distiluse-base-multilingual-cased-v2': {
    },
    'paraphrase-multilingual-MiniLM-L12-v2': {
    },
    'paraphrase-multilingual-mpnet-base-v2': {
    },   
    
}
model_result_umap = {
    'distiluse-base-multilingual-cased-v2': [],
    'paraphrase-multilingual-MiniLM-L12-v2': [],
    'paraphrase-multilingual-mpnet-base-v2': [],
}

In [27]:
def run_model_umap(X, umap_param):
    start_time = time.time()
    X_umap = umap.UMAP(**umap_param).fit_transform(X)
    umap_time = time.time() - start_time
    print('   umap_time:', umap_time)
    
    start_time = time.time()
    result = find_spearman_pearson(X_umap, labels)
    spearman_pearson_time = time.time() - start_time
    
    result['umap_time'] = umap_time
    result['spearman_pearson_time'] = spearman_pearson_time
    
    return result 

In [28]:
umap_keys = list(umap_params.keys())
umap_values = umap_params.values()
umap_all_combinations = list(itertools.product(*umap_values))


for key, value in parameters.items():
    print(f'--- {key} ---')
    start_time = time.time()
    model = SentenceTransformer(key)
    load_model_time = time.time() - start_time
    print('   load_model_time:', load_model_time)
    
    start_time = time.time()
    X = model.encode(T, convert_to_tensor=True)
    encode_time = time.time() - start_time
    print('   encode_time:', encode_time)
    
    for umap_parameter_value in umap_all_combinations:
        umap_parameter_result = {}
        for i_parameter in range(len(umap_parameter_value)):
            umap_parameter_result[umap_keys[i_parameter]] = umap_parameter_value[i_parameter]
        print('   umap_parameter:', umap_parameter_result)
        result = run_model_umap(X, umap_param=umap_parameter_result) 
        result['load_model_time'] = load_model_time
        result['encode_time'] = encode_time  
        result['umap_param'] = umap_parameter_result
        result['model_name'] = key
        model_result_umap[key].append(result) 
        print(len(model_result_umap[key]))
        print(result) 

--- distiluse-base-multilingual-cased-v2 ---
   load_model_time: 0.8553686141967773
   encode_time: 175.56898069381714
   umap_parameter: {'n_components': 2, 'n_neighbors': 2}


failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


   umap_time: 8.837761878967285
1
{'spearman_cosine': 0.035263988552731614, 'pearson_cosine': 0.02963233237752125, 'spearman_dot': 0.028810052038481485, 'pearson_dot': 0.03176504802515109, 'umap_time': 8.837761878967285, 'spearman_pearson_time': 3.3302886486053467, 'load_model_time': 0.8553686141967773, 'encode_time': 175.56898069381714, 'umap_param': {'n_components': 2, 'n_neighbors': 2}, 'model_name': 'distiluse-base-multilingual-cased-v2'}
   umap_parameter: {'n_components': 2, 'n_neighbors': 4}
   umap_time: 10.313378095626831
2
{'spearman_cosine': 0.10343262525275582, 'pearson_cosine': 0.060565530957452623, 'spearman_dot': 0.07297796644066232, 'pearson_dot': 0.07675776527408584, 'umap_time': 10.313378095626831, 'spearman_pearson_time': 4.079493761062622, 'load_model_time': 0.8553686141967773, 'encode_time': 175.56898069381714, 'umap_param': {'n_components': 2, 'n_neighbors': 4}, 'model_name': 'distiluse-base-multilingual-cased-v2'}
   umap_parameter: {'n_components': 2, 'n_neighbo

In [None]:
model_result_umap

{'distiluse-base-multilingual-cased-v2': {'spearman_cosine': 0.14000025407279698,
  'pearson_cosine': 0.12298439953045988,
  'spearman_dot': 0.04318345918506397,
  'pearson_dot': 0.052250197479265015,
  'umap_time': 14.281391143798828,
  'spearman_pearson_time': 6.261852264404297,
  'load_model_time': 1.1425349712371826,
  'encode_time': 149.15838837623596,
  'umap_param': {'n_components': 10, 'n_neighbors': 64},
  'model_name': 'distiluse-base-multilingual-cased-v2'},
 'paraphrase-multilingual-MiniLM-L12-v2': {'spearman_cosine': 0.18836090726677582,
  'pearson_cosine': 0.15108449176869404,
  'spearman_dot': 0.019051820501864523,
  'pearson_dot': 0.025953807966455193,
  'umap_time': 15.400129318237305,
  'spearman_pearson_time': 4.561873912811279,
  'load_model_time': 2.5872137546539307,
  'encode_time': 105.56571006774902,
  'umap_param': {'n_components': 10, 'n_neighbors': 64},
  'model_name': 'paraphrase-multilingual-MiniLM-L12-v2'},
 'paraphrase-multilingual-mpnet-base-v2': {'spear

In [None]:
model_result_umap_temp = model_result_umap.copy()

In [None]:
result_list = []
for key, value in model_result_umap_temp.items():
    # model_result_umap_temp[key]['model_name'] = key
    result_list.extend(model_result_umap_temp[key])
    
df_result_all = pd.DataFrame(result_list)
df_result_all.to_csv('spearman_pearson_umap_output.csv',index=False, sep='\t')

In [None]:
--- distiluse-base-multilingual-cased-v2 ---
   load_model_time: 10.39717721939087
   encode_time: 149.7373948097229

In [64]:
# eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
# eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

# eval_pearson_dot, _ = pearsonr(labels, dot_products)
# eval_spearman_dot, _ = spearmanr(labels, dot_products)

In [None]:
# cosine_scores = 1 - (paired_cosine_distances(X, X))
# manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
# euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
# dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]


# eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
# eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

# eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
# eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

# eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
# eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

# eval_pearson_dot, _ = pearsonr(labels, dot_products)
# eval_spearman_dot, _ = spearmanr(labels, dot_products)