In [9]:
##IMPORTS##

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import f1_score

In [10]:
## Initialize Transformer and pdf files ##
model = SentenceTransformer('all-MiniLM-L6-v2')

file_2023 = '/Users/samirkadariya/Desktop/School/UROP IAP 2024/Original Databases/tariff database_202305.xlsx'
file_1990 = '/Users/samirkadariya/Desktop/School/UROP IAP 2024/Original Databases/1990_CUT.xlsx' 

df_2023 = pd.read_excel(file_2023)
df_1990 = pd.read_excel(file_1990)  # Load 1990 data

In [11]:
def find_similar_hs_codes_transformers(df_1990, df_2023, top_n=1):
    # Combining item and description fields for 1990 data
    df_1990['combined_description'] = df_1990['ProductDescription'].fillna('')

    # Compute embeddings for each description
    embeddings_1990 = model.encode(df_1990['combined_description'].tolist(), convert_to_tensor=True)
    embeddings_2023 = model.encode(df_2023['brief_description'].tolist(), convert_to_tensor=True)

    # Calculate cosine similarities
    cosine_scores = util.pytorch_cos_sim(embeddings_1990, embeddings_2023)

    # Find the top N similar HS codes for each 1990 item
    hs_code_matches = []
    for i in range(len(df_1990)):
        top_results = np.argsort(-cosine_scores[i].cpu().numpy())[:top_n]
        matched_hs_codes = [(df_2023.iloc[j]['hts8'], cosine_scores[i][j].item()) for j in top_results]
        hs_code_matches.append(matched_hs_codes)

    return hs_code_matches

In [12]:
def calculate_f1_scores(y_true, y_pred, digit_level=10):
    # Truncate HS codes to the specified digit level
    y_true_truncated = [str(code)[:digit_level] for code in y_true]
    y_pred_truncated = [str(code)[:digit_level] for code in y_pred]

    # Calculate F1 score
    return f1_score(y_true_truncated, y_pred_truncated, average='weighted')

In [13]:
def match_and_export_hs_codes_transformers(df_1990, df_2023, output_file_path):
    all_matches = find_similar_hs_codes_transformers(df_1990, df_2023)

    export_data = []
    for i, matches in enumerate(all_matches):
        for hs_code, score in matches:
            df_2023_row = df_2023[df_2023['hts8'] == hs_code].iloc[0]
            original_hs_code = df_1990.iloc[i]['ProductCode']  # Replace with actual column name
            f1_hs10 = calculate_f1_scores([original_hs_code], [hs_code], 10)
            f1_hs6 = calculate_f1_scores([original_hs_code], [hs_code], 6)
            f1_hs4 = calculate_f1_scores([original_hs_code], [hs_code], 4)

            export_data.append({
                '1990 Product': df_1990.iloc[i]['ProductDescription'],
                'Original HS Code': original_hs_code,
                'Matched HS Code': hs_code,
                '2023 Description': df_2023_row['brief_description'],
                'F1 Score HS10': f1_hs10,
                'F1 Score HS6': f1_hs6,
                'F1 Score HS4': f1_hs4,
                'Similarity Score': score
            })

    export_df = pd.DataFrame(export_data)
    export_df.to_csv(output_file_path, index=False)

    return export_df

In [14]:
output_csv_path = 'matched_hs_codes_1990_to_2023_transformers.csv'
exported_df_transformers = match_and_export_hs_codes_transformers(df_1990, df_2023, output_csv_path)
print(exported_df_transformers.head())

                                        1990 Product  Original HS Code  \
0  Live ducks, geese, turkeys and guineas, weighi...           1051900   
1  Live swine, other than purebred breeding swine...           1039100   
2      Live asses other than purebred breeding asses           1012020   
3                     Live birds, other than poultry           1060010   

   Matched HS Code                                   2023 Description  \
0          1059900  Live ducks, geese, turkeys and guineas, weighi...   
1          1039100  Live swine, other than purebred breeding swine...   
2          1013000                                         Live asses   
3          1063901  Live birds, other than poultry, birds of prey ...   

   F1 Score HS10  F1 Score HS6  F1 Score HS4  Similarity Score  
0            0.0           0.0           0.0          0.986649  
1            1.0           1.0           1.0          1.000000  
2            0.0           0.0           0.0          0.631173  
3  