In [2]:
%load_ext autoreload
%autoreload 2

In [18]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
import time
import umap
import umap.plot
import time
import itertools

from utils import prepare_dataset_en, find_spearman_pearson

import logging
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.WARNING,  # set 3rd party logs to warning (for hiding it)
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

#### Input

In [52]:
df_hscode4 = prepare_dataset_en()
df_hscode4

Unnamed: 0,HSCode2,hscode4_text
0,01,"Live horses, donkeys and mules"
1,01,Bovine animals live
2,01,live pig
3,01,Live sheep and goats
4,01,"Live poultry include gallus domesticus, ducks,..."
...,...,...
2422,97,"Original engravings, prints and lithographs."
2423,97,"Original sculptures and statuary, in any mater..."
2424,97,"Postage or revenue stamps, stamp-postmarks, fi..."
2425,97,Collections and collectors' pieces of zoologic...


In [53]:
df_cross = df_hscode4.merge(df_hscode4, how='cross')
df_cross.loc[ df_cross['HSCode2_x'] == df_cross['HSCode2_y'], 'score'] = 1
df_cross.loc[ df_cross['HSCode2_x'] != df_cross['HSCode2_y'], 'score'] = 0
df_cross

Unnamed: 0,HSCode2_x,hscode4_text_x,HSCode2_y,hscode4_text_y,score
0,01,"Live horses, donkeys and mules",01,"Live horses, donkeys and mules",1.0
1,01,"Live horses, donkeys and mules",01,Bovine animals live,1.0
2,01,"Live horses, donkeys and mules",01,live pig,1.0
3,01,"Live horses, donkeys and mules",01,Live sheep and goats,1.0
4,01,"Live horses, donkeys and mules",01,"Live poultry include gallus domesticus, ducks,...",1.0
...,...,...,...,...,...
5890324,97,Antiques of an age exceeding one hundred years.,97,"Original engravings, prints and lithographs.",1.0
5890325,97,Antiques of an age exceeding one hundred years.,97,"Original sculptures and statuary, in any mater...",1.0
5890326,97,Antiques of an age exceeding one hundred years.,97,"Postage or revenue stamps, stamp-postmarks, fi...",1.0
5890327,97,Antiques of an age exceeding one hundred years.,97,Collections and collectors' pieces of zoologic...,1.0


In [54]:
T = df_hscode4['hscode4_text'].values
Y = df_hscode4['HSCode2'].values
labels = df_cross['score'].values

In [55]:
labels.shape

(5890329,)

In [56]:
parameters = {
    'distiluse-base-multilingual-cased-v2': {
    },
    'paraphrase-multilingual-MiniLM-L12-v2': {
    },
    'paraphrase-multilingual-mpnet-base-v2': {
    },
    
    
}

model_result = {
    'distiluse-base-multilingual-cased-v2': {},
    'paraphrase-multilingual-MiniLM-L12-v2': {},
    'paraphrase-multilingual-mpnet-base-v2': {},
}

In [57]:
def run_model(model_name):
    start_time = time.time()
    model = SentenceTransformer(model_name)
    load_model_time = time.time() - start_time
    print('   load_model_time:', load_model_time)
    
    start_time = time.time()
    X = model.encode(T, convert_to_tensor=True)
    encode_time = time.time() - start_time
    print('   encode_time:', load_model_time)
    
    start_time = time.time()
    result = find_spearman_pearson(X, labels)
    spearman_pearson_time = time.time() - start_time
    
    result['load_model_time'] = load_model_time
    result['encode_time'] = encode_time
    result['spearman_pearson_time'] = spearman_pearson_time
    
    return result 
    

In [58]:
for key, value in parameters.items():
    print(f'--- {key} ---')
    result = run_model(key)   
    model_result[key] = result 
    print(result)

--- distiluse-base-multilingual-cased-v2 ---
   load_model_time: 1.6472022533416748
   encode_time: 1.6472022533416748
{'spearman_cosine': 0.14813543998884737, 'pearson_cosine': 0.2383370164302456, 'spearman_dot': 0.14719805177134232, 'pearson_dot': 0.23819834775181092, 'load_model_time': 1.6472022533416748, 'encode_time': 108.6062114238739, 'spearman_pearson_time': 5.742818117141724}
--- paraphrase-multilingual-MiniLM-L12-v2 ---
   load_model_time: 4.832189083099365
   encode_time: 4.832189083099365
{'spearman_cosine': 0.1753432462244915, 'pearson_cosine': 0.257245632825537, 'spearman_dot': 0.1646286559447913, 'pearson_dot': 0.23856710758810684, 'load_model_time': 4.832189083099365, 'encode_time': 67.03611612319946, 'spearman_pearson_time': 9.78647494316101}
--- paraphrase-multilingual-mpnet-base-v2 ---
   load_model_time: 30.25308322906494
   encode_time: 30.25308322906494
{'spearman_cosine': 0.18293111986707342, 'pearson_cosine': 0.2640745827270426, 'spearman_dot': 0.178823415104757

In [59]:
model_result_temp = model_result.copy()

In [60]:
result_list = []
for key, value in model_result_temp.items():
    model_result_temp[key]['model_name'] = key
    result_list.append(model_result_temp[key])
    
df_result_all = pd.DataFrame(result_list)
df_result_all.to_csv('spearman_pearson_output_en.csv',index=False, sep='\t')

In [92]:
model_result

{'distiluse-base-multilingual-cased-v2': {'model_name': 'distiluse-base-multilingual-cased-v2'},
 'paraphrase-multilingual-MiniLM-L12-v2': {'model_name': 'paraphrase-multilingual-MiniLM-L12-v2'},
 'paraphrase-multilingual-mpnet-base-v2': {'model_name': 'paraphrase-multilingual-mpnet-base-v2'}}

#### Umap

In [64]:
umap_params = {
    'n_components': np.arange(2, 11, 1),
    'n_neighbors': [2,4,8,16,32,64],
}

parameters = {
    # 'distiluse-base-multilingual-cased-v2': {
    # },
    'paraphrase-multilingual-MiniLM-L12-v2': {
    },
    # 'paraphrase-multilingual-mpnet-base-v2': {
    # },   
    
}
model_result_umap = {
    'distiluse-base-multilingual-cased-v2': [],
    'paraphrase-multilingual-MiniLM-L12-v2': [],
    'paraphrase-multilingual-mpnet-base-v2': [],
}

In [65]:
def run_model_umap(X, umap_param):
    start_time = time.time()
    X_umap = umap.UMAP(**umap_param).fit_transform(X)
    umap_time = time.time() - start_time
    print('   umap_time:', umap_time)
    
    start_time = time.time()
    result = find_spearman_pearson(X_umap, labels)
    spearman_pearson_time = time.time() - start_time
    
    result['umap_time'] = umap_time
    result['spearman_pearson_time'] = spearman_pearson_time
    
    return result 

In [66]:
umap_keys = list(umap_params.keys())
umap_values = umap_params.values()
umap_all_combinations = list(itertools.product(*umap_values))


for key, value in parameters.items():
    print(f'--- {key} ---')
    start_time = time.time()
    model = SentenceTransformer(key)
    load_model_time = time.time() - start_time
    print('   load_model_time:', load_model_time)
    
    start_time = time.time()
    X = model.encode(T, convert_to_tensor=True)
    encode_time = time.time() - start_time
    print('   encode_time:', encode_time)
    
    for umap_parameter_value in umap_all_combinations:
        umap_parameter_result = {}
        for i_parameter in range(len(umap_parameter_value)):
            umap_parameter_result[umap_keys[i_parameter]] = umap_parameter_value[i_parameter]
        print('   umap_parameter:', umap_parameter_result)
        result = run_model_umap(X, umap_param=umap_parameter_result) 
        result['load_model_time'] = load_model_time
        result['encode_time'] = encode_time  
        result['umap_param'] = umap_parameter_result
        result['model_name'] = key
        model_result_umap[key].append(result) 
        print(len(model_result_umap[key]))
        print(result) 

--- paraphrase-multilingual-MiniLM-L12-v2 ---
   load_model_time: 9.289819955825806
   encode_time: 65.91780662536621
   umap_parameter: {'n_components': 2, 'n_neighbors': 2}


failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


   umap_time: 15.282814979553223
1
{'spearman_cosine': 0.03258369417864273, 'pearson_cosine': 0.029536174354480502, 'spearman_dot': 0.031962167474619906, 'pearson_dot': 0.03772078169672341, 'umap_time': 15.282814979553223, 'spearman_pearson_time': 3.3044395446777344, 'load_model_time': 9.289819955825806, 'encode_time': 65.91780662536621, 'umap_param': {'n_components': 2, 'n_neighbors': 2}, 'model_name': 'paraphrase-multilingual-MiniLM-L12-v2'}
   umap_parameter: {'n_components': 2, 'n_neighbors': 4}
   umap_time: 10.082048892974854
2
{'spearman_cosine': 0.08693779369137991, 'pearson_cosine': 0.049801272539258395, 'spearman_dot': -0.005294216322368499, 'pearson_dot': 0.0006017408904502577, 'umap_time': 10.082048892974854, 'spearman_pearson_time': 3.1112375259399414, 'load_model_time': 9.289819955825806, 'encode_time': 65.91780662536621, 'umap_param': {'n_components': 2, 'n_neighbors': 4}, 'model_name': 'paraphrase-multilingual-MiniLM-L12-v2'}
   umap_parameter: {'n_components': 2, 'n_ne

In [67]:
model_result_umap

{'distiluse-base-multilingual-cased-v2': [],
 'paraphrase-multilingual-MiniLM-L12-v2': [{'spearman_cosine': 0.03258369417864273,
   'pearson_cosine': 0.029536174354480502,
   'spearman_dot': 0.031962167474619906,
   'pearson_dot': 0.03772078169672341,
   'umap_time': 15.282814979553223,
   'spearman_pearson_time': 3.3044395446777344,
   'load_model_time': 9.289819955825806,
   'encode_time': 65.91780662536621,
   'umap_param': {'n_components': 2, 'n_neighbors': 2},
   'model_name': 'paraphrase-multilingual-MiniLM-L12-v2'},
  {'spearman_cosine': 0.08693779369137991,
   'pearson_cosine': 0.049801272539258395,
   'spearman_dot': -0.005294216322368499,
   'pearson_dot': 0.0006017408904502577,
   'umap_time': 10.082048892974854,
   'spearman_pearson_time': 3.1112375259399414,
   'load_model_time': 9.289819955825806,
   'encode_time': 65.91780662536621,
   'umap_param': {'n_components': 2, 'n_neighbors': 4},
   'model_name': 'paraphrase-multilingual-MiniLM-L12-v2'},
  {'spearman_cosine': 0.1

In [68]:
model_result_umap_temp = model_result_umap.copy()

In [69]:
result_list = []
for key, value in model_result_umap_temp.items():
    # model_result_umap_temp[key]['model_name'] = key
    result_list.extend(model_result_umap_temp[key])
    
df_result_all = pd.DataFrame(result_list)
df_result_all.to_csv('spearman_pearson_umap_output_en.csv',index=False, sep='\t')

#### Debug

In [70]:
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [37]:
# T

In [71]:
X = model.encode(T, convert_to_tensor=True)

In [72]:
cos_sim = util.cos_sim(X, X)
dot_products = util.dot_score(X,X)

cosine_scores = torch.reshape(cos_sim, (labels.shape[0], ))
dot_products = torch.reshape(dot_products, (labels.shape[0], ))

In [75]:
i = 100
n = 30
a = cos_sim[i,:].numpy()
index = np.argsort(a)[::-1][:n]

In [76]:
df = df_hscode4.iloc[index]
df['similarity'] = a[index]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'] = a[index]


Unnamed: 0,HSCode2,hscode4_text,similarity
100,11,Starch and Inulin,1.0
1320,11,Starches; inulin.,0.907254
1607,35,Dextrins and other modified starches (e.g. : p...,0.494198
399,35,"dextrin and other modified starches (e.g., pre...",0.482555
158,19,Tapioca products and substitutes for tapioca p...,0.455093
1404,23,Residues of starch manufacture and similar res...,0.452989
1280,7,"Manioc, arrowroot, salep, jerusalem artichokes...",0.452308
187,23,"starch and similar residues; beet pulp, bagass...",0.443659
1375,19,Tapioca and substitutes therefor prepared from...,0.442449
60,7,"manioc, thirty roots, salep, Jerusalem articho...",0.409382
