In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
import time
import umap
import umap.plot
import time
import itertools

from utils import prepare_dataset_en, find_spearman_pearson

import logging
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.WARNING,  # set 3rd party logs to warning (for hiding it)
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

#### Input

In [5]:
df_hscode4 = prepare_dataset_en()
df_hscode4

Unnamed: 0,HSCode2,hscode4_text
0,01,"Live horses, donkeys and mules"
1,01,Bovine animals live
2,01,live pig
3,01,Live sheep and goats
4,01,"Live poultry include gallus domesticus, ducks,..."
...,...,...
2422,97,"Original engravings, prints and lithographs."
2423,97,"Original sculptures and statuary, in any mater..."
2424,97,"Postage or revenue stamps, stamp-postmarks, fi..."
2425,97,Collections and collectors' pieces of zoologic...


In [6]:
df_cross = df_hscode4.merge(df_hscode4, how='cross')
df_cross.loc[ df_cross['HSCode2_x'] == df_cross['HSCode2_y'], 'score'] = 1
df_cross.loc[ df_cross['HSCode2_x'] != df_cross['HSCode2_y'], 'score'] = 0
df_cross

Unnamed: 0,HSCode2_x,hscode4_text_x,HSCode2_y,hscode4_text_y,score
0,01,"Live horses, donkeys and mules",01,"Live horses, donkeys and mules",1.0
1,01,"Live horses, donkeys and mules",01,Bovine animals live,1.0
2,01,"Live horses, donkeys and mules",01,live pig,1.0
3,01,"Live horses, donkeys and mules",01,Live sheep and goats,1.0
4,01,"Live horses, donkeys and mules",01,"Live poultry include gallus domesticus, ducks,...",1.0
...,...,...,...,...,...
5890324,97,Antiques of an age exceeding one hundred years.,97,"Original engravings, prints and lithographs.",1.0
5890325,97,Antiques of an age exceeding one hundred years.,97,"Original sculptures and statuary, in any mater...",1.0
5890326,97,Antiques of an age exceeding one hundred years.,97,"Postage or revenue stamps, stamp-postmarks, fi...",1.0
5890327,97,Antiques of an age exceeding one hundred years.,97,Collections and collectors' pieces of zoologic...,1.0


In [7]:
text_input = df_hscode4['hscode4_text'].values
labels = df_cross['score'].values

In [8]:
labels.shape

(5890329,)

In [9]:
parameters = {
    'distiluse-base-multilingual-cased-v2': {},
    'paraphrase-multilingual-MiniLM-L12-v2': {},
    'paraphrase-multilingual-mpnet-base-v2': {},  
}

model_result = {
    'distiluse-base-multilingual-cased-v2': {},
    'paraphrase-multilingual-MiniLM-L12-v2': {},
    'paraphrase-multilingual-mpnet-base-v2': {},
}

In [10]:
def run_model(model_name):
    start_time = time.time()
    model = SentenceTransformer(model_name)
    load_model_time = time.time() - start_time
    print('   load_model_time:', load_model_time)
    
    start_time = time.time()
    X = model.encode(text_input, convert_to_tensor=True)
    encode_time = time.time() - start_time
    print('   encode_time:', load_model_time)
    
    start_time = time.time()
    result = find_spearman_pearson(X, labels)
    spearman_pearson_time = time.time() - start_time
    
    result['load_model_time'] = load_model_time
    result['encode_time'] = encode_time
    result['spearman_pearson_time'] = spearman_pearson_time
    
    return result 
    

In [11]:
for key, value in parameters.items():
    print(f'--- {key} ---')
    result = run_model(key)   
    result.update({'model_name': key })
    # model_result[key] = result 
    df_result = pd.DataFrame([result])
    df_result.to_csv(f'output/semantic_similarity/no_umap_{key}.csv',index=False, sep='\t')
    # print(df_result)

--- distiluse-base-multilingual-cased-v2 ---
   load_model_time: 13.464010238647461
   encode_time: 13.464010238647461
   spearman_cosine  pearson_cosine  spearman_dot  pearson_dot  \
0         0.148135        0.238337      0.147198     0.238198   

   load_model_time  encode_time  spearman_pearson_time  \
0         13.46401    62.010487               4.264453   

                             model_name  
0  distiluse-base-multilingual-cased-v2  
--- paraphrase-multilingual-MiniLM-L12-v2 ---
   load_model_time: 2.1492221355438232
   encode_time: 2.1492221355438232
   spearman_cosine  pearson_cosine  spearman_dot  pearson_dot  \
0         0.175343        0.257246      0.164629     0.238567   

   load_model_time  encode_time  spearman_pearson_time  \
0         2.149222    34.853023               6.764616   

                              model_name  
0  paraphrase-multilingual-MiniLM-L12-v2  
--- paraphrase-multilingual-mpnet-base-v2 ---
   load_model_time: 2.5109007358551025
   encode_

#### Umap

In [13]:
umap_params = {
    'n_components': np.arange(2, 11, 1),
    'n_neighbors': [2,4,8,16,32,64],
}

parameters = {
    'distiluse-base-multilingual-cased-v2': {},
    'paraphrase-multilingual-MiniLM-L12-v2': {},
    'paraphrase-multilingual-mpnet-base-v2': {},   
    
}
model_result_umap = {
    'distiluse-base-multilingual-cased-v2': [],
    'paraphrase-multilingual-MiniLM-L12-v2': [],
    'paraphrase-multilingual-mpnet-base-v2': [],
}

In [14]:
def run_model_umap(X, umap_param):
    start_time = time.time()
    X_umap = umap.UMAP(**umap_param).fit_transform(X)
    umap_time = time.time() - start_time
    print('   umap_time:', umap_time)
    
    start_time = time.time()
    result = find_spearman_pearson(X_umap, labels)
    spearman_pearson_time = time.time() - start_time
    
    result['umap_time'] = umap_time
    result['spearman_pearson_time'] = spearman_pearson_time
    
    return result 

In [15]:
umap_keys = list(umap_params.keys())
umap_values = umap_params.values()
umap_all_combinations = list(itertools.product(*umap_values))


for key, value in parameters.items():
    print(f'--- {key} ---')
    start_time = time.time()
    model = SentenceTransformer(key)
    load_model_time = time.time() - start_time
    print('   load_model_time:', load_model_time)
    
    start_time = time.time()
    X = model.encode(text_input, convert_to_tensor=True)
    encode_time = time.time() - start_time
    print('   encode_time:', encode_time)
    
    for umap_parameter_value in umap_all_combinations:
        umap_parameter_result = {}
        for i_parameter in range(len(umap_parameter_value)):
            umap_parameter_result[umap_keys[i_parameter]] = umap_parameter_value[i_parameter]
        print('   umap_parameter:', umap_parameter_result)
        result = run_model_umap(X, umap_param=umap_parameter_result) 
        result['load_model_time'] = load_model_time
        result['encode_time'] = encode_time  
        result['umap_param'] = umap_parameter_result
        result['model_name'] = key
        model_result_umap[key].append(result) 
        print(len(model_result_umap[key]))

    df_result = pd.DataFrame(model_result_umap[key])
    df_result.to_csv(f'output/semantic_similarity/umap_{key}.csv',index=False, sep='\t')

--- distiluse-base-multilingual-cased-v2 ---
   load_model_time: 1.1418468952178955
   encode_time: 92.62573981285095
   umap_parameter: {'n_components': 2, 'n_neighbors': 2}


failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


   umap_time: 14.206292390823364
1
   umap_parameter: {'n_components': 2, 'n_neighbors': 4}
   umap_time: 9.565637588500977
2
   umap_parameter: {'n_components': 2, 'n_neighbors': 8}
   umap_time: 7.589429140090942
3
   umap_parameter: {'n_components': 2, 'n_neighbors': 16}
   umap_time: 10.42237663269043
4
   umap_parameter: {'n_components': 2, 'n_neighbors': 32}
   umap_time: 12.079277992248535
5
   umap_parameter: {'n_components': 2, 'n_neighbors': 64}
   umap_time: 9.797019004821777
6
   umap_parameter: {'n_components': 3, 'n_neighbors': 2}
   umap_time: 8.275845527648926
7
   umap_parameter: {'n_components': 3, 'n_neighbors': 4}
   umap_time: 7.881196975708008
8
   umap_parameter: {'n_components': 3, 'n_neighbors': 8}
   umap_time: 9.329438924789429
9
   umap_parameter: {'n_components': 3, 'n_neighbors': 16}
   umap_time: 8.51816725730896
10
   umap_parameter: {'n_components': 3, 'n_neighbors': 32}
   umap_time: 11.25245714187622
11
   umap_parameter: {'n_components': 3, 'n_neighb

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


   umap_time: 9.955172061920166
1
   umap_parameter: {'n_components': 2, 'n_neighbors': 4}
   umap_time: 7.246172666549683
2
   umap_parameter: {'n_components': 2, 'n_neighbors': 8}
   umap_time: 7.47386360168457
3
   umap_parameter: {'n_components': 2, 'n_neighbors': 16}
   umap_time: 7.8872857093811035
4
   umap_parameter: {'n_components': 2, 'n_neighbors': 32}
   umap_time: 8.26736330986023
5
   umap_parameter: {'n_components': 2, 'n_neighbors': 64}
   umap_time: 9.43744444847107
6
   umap_parameter: {'n_components': 3, 'n_neighbors': 2}
   umap_time: 10.824169397354126
7
   umap_parameter: {'n_components': 3, 'n_neighbors': 4}
   umap_time: 6.683725357055664
8
   umap_parameter: {'n_components': 3, 'n_neighbors': 8}
   umap_time: 7.0629260540008545
9
   umap_parameter: {'n_components': 3, 'n_neighbors': 16}
   umap_time: 8.369946718215942
10
   umap_parameter: {'n_components': 3, 'n_neighbors': 32}
   umap_time: 10.423950433731079
11
   umap_parameter: {'n_components': 3, 'n_neighb

#### Debug

In [70]:
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [71]:
X = model.encode(text_input, convert_to_tensor=True)

In [72]:
cos_sim = util.cos_sim(X, X)
dot_products = util.dot_score(X,X)

cosine_scores = torch.reshape(cos_sim, (labels.shape[0], ))
dot_products = torch.reshape(dot_products, (labels.shape[0], ))

In [75]:
i = 100
n = 30
a = cos_sim[i,:].numpy()
index = np.argsort(a)[::-1][:n]

In [76]:
df = df_hscode4.iloc[index]
df['similarity'] = a[index]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'] = a[index]


Unnamed: 0,HSCode2,hscode4_text,similarity
100,11,Starch and Inulin,1.0
1320,11,Starches; inulin.,0.907254
1607,35,Dextrins and other modified starches (e.g. : p...,0.494198
399,35,"dextrin and other modified starches (e.g., pre...",0.482555
158,19,Tapioca products and substitutes for tapioca p...,0.455093
1404,23,Residues of starch manufacture and similar res...,0.452989
1280,7,"Manioc, arrowroot, salep, jerusalem artichokes...",0.452308
187,23,"starch and similar residues; beet pulp, bagass...",0.443659
1375,19,Tapioca and substitutes therefor prepared from...,0.442449
60,7,"manioc, thirty roots, salep, Jerusalem articho...",0.409382
