In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from fuzzywuzzy import fuzz
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the SentenceTransformer model
# No device argument is passed, so device selection is left to the library default.
# NOTE(review): confirm the default actually selects the GPU when one is available.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Ensure the model uses GPU if available

# File paths
# These are absolute, machine-specific locations; the notebook only runs where they exist.
file_2023 = '/home/samirk08/UROP_SPRING_2024/UROP IAP 2024/Original Databases/tariff database_202305.xlsx'
file_1990 = '/home/samirk08/UROP_SPRING_2024/1990/1000_1990.xlsx'
# manual_coding_file = '/Users/samirkadariya/Desktop/School/UROP IAP 2024/Original Databases/Manual coding.xlsx'

# Load the data
# df_2023 is the reference table (its 'brief_description' and 'hts8' columns are read later);
# df_1990 holds the items to classify (its 'ProductDescription' column is read later).
df_2023 = pd.read_excel(file_2023)
df_1990 = pd.read_excel(file_1990)
# df_manual_coding = pd.read_excel(manual_coding_file)

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
def batch_encode_descriptions(model, descriptions, batch_size=32):
    """Encode descriptions in fixed-size batches and stack the results.

    Args:
        model: Object exposing ``encode(batch, convert_to_tensor=True)`` that
            returns one tensor row per input text.
        descriptions: Sliceable sequence of texts (e.g. a list).
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        A tensor holding all batch outputs concatenated along dim 0, in input
        order. For empty input a zero-row tensor is returned; its width is the
        model's reported embedding dimension when available, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total = len(descriptions)
    if total == 0:
        # torch.cat refuses an empty list of tensors, so short-circuit here
        # and hand back an empty result instead of crashing.
        dim_getter = getattr(model, 'get_sentence_embedding_dimension', None)
        width = dim_getter() if callable(dim_getter) else None
        return torch.empty((0, width or 0))

    all_embeddings = [
        model.encode(descriptions[start:start + batch_size], convert_to_tensor=True)
        for start in range(0, total, batch_size)
    ]
    return torch.cat(all_embeddings, dim=0)

In [4]:
# Pre-compute embeddings for the 2023 dataset to avoid redundant computation
# Row i of embeddings_2023 is the encoding of the i-th 'brief_description' in df_2023.
# find_most_similar_hs_code looks rows up with df_2023.iloc using positions taken from
# these embeddings, so df_2023 must not be filtered or re-ordered after this cell runs.
# NOTE(review): assumes every 'brief_description' is a non-missing string — confirm.
brief_descriptions = df_2023['brief_description'].tolist()
embeddings_2023 = batch_encode_descriptions(model, brief_descriptions)

In [5]:
def find_most_similar_hs_code(description, embeddings_2023, df_2023, top_n=1):
    """Return the HS code whose 2023 description best matches ``description``.

    The query is encoded with the module-level ``model`` and compared against
    every row of ``embeddings_2023`` by cosine similarity.

    Args:
        description: Text to classify.
        embeddings_2023: Tensor with one embedding row per row of ``df_2023``,
            in the same positional order.
        df_2023: Reference DataFrame; its ``'hts8'`` column supplies the code.
        top_n: Number of candidates to consider. Only the single best match is
            returned; a value below 1 means "no candidates".

    Returns:
        ``(predicted_hs_code, confidence_score)`` where the score is the cosine
        similarity of the best match, or ``('', 0.0)`` when there is nothing to
        search (no reference embeddings, or ``top_n`` below 1).
    """
    # Nothing to compare against: skip the encoding work entirely.
    if top_n < 1 or len(embeddings_2023) == 0:
        return '', 0.0

    description_embedding = model.encode([description], convert_to_tensor=True)
    # Single query row, so take row 0 of the (1, n_reference) score matrix.
    cosine_scores = util.pytorch_cos_sim(description_embedding, embeddings_2023)[0]

    # Only the maximum is needed; torch.max finds it on the tensor's own device
    # without copying every score to the CPU and sorting the whole vector.
    best_score, best_index = torch.max(cosine_scores, dim=0)

    predicted_hs_code = df_2023.iloc[int(best_index)]['hts8']
    return predicted_hs_code, best_score.item()

In [6]:
def calculate_similarity_with_actual_hs_code(predicted_hs_code, actual_hs_code):
    """Score how closely two HS codes match as text, on a 0-to-1 scale.

    Both codes are converted with ``str`` before comparison, so numeric and
    string inputs are accepted. The score is ``fuzz.ratio`` (0-100) divided
    by 100.
    """
    code_pair = (str(predicted_hs_code), str(actual_hs_code))
    percent_match = fuzz.ratio(*code_pair)
    return percent_match / 100.0

In [7]:
def process_item_and_predict_hs_code(row, embeddings_2023, df_2023):
    """Predict an HS code for one 1990 item.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a
            ``'ProductDescription'`` entry.
        embeddings_2023: Reference embeddings, forwarded to
            ``find_most_similar_hs_code``.
        df_2023: Reference DataFrame, forwarded to
            ``find_most_similar_hs_code``.

    Returns:
        Dict with keys ``'1990 Item'`` (the original description value),
        ``'Predicted HS Code'`` and ``'Confidence Score'``. When the
        description is missing or blank, the prediction is ``''`` with a
        score of ``0.0`` and no similarity search is run.
    """
    raw_description = row['ProductDescription']

    # Formatting NaN/None into an f-string yields the literal text "nan"/"None",
    # which would then be matched as if it were a real product description.
    is_missing = pd.api.types.is_scalar(raw_description) and pd.isna(raw_description)
    description = '' if is_missing else f"{raw_description}".strip()

    if description:
        predicted_hs_code, confidence_score = find_most_similar_hs_code(
            description, embeddings_2023, df_2023
        )
    else:
        predicted_hs_code, confidence_score = '', 0.0

    return {
        '1990 Item': raw_description,
        'Predicted HS Code': predicted_hs_code,
        'Confidence Score': confidence_score
    }

In [8]:
def match_and_export_parallel(df_1990, embeddings_2023, df_2023, output_file_path, max_workers=50):
    """Predict an HS code for every row of ``df_1990`` and write them to CSV.

    Rows are processed concurrently on a thread pool, and the CSV rows are
    written in the same order as the rows of ``df_1990``.

    Args:
        df_1990: DataFrame of items to classify, one item per row.
        embeddings_2023: Reference embeddings, forwarded to the per-row worker.
        df_2023: Reference DataFrame, forwarded to the per-row worker.
        output_file_path: Destination CSV path (written without the index).
        max_workers: Size of the thread pool.

    Raises:
        Any exception raised while processing a row propagates to the caller.
    """
    process_func = partial(
        process_item_and_predict_hs_code,
        embeddings_2023=embeddings_2023,
        df_2023=df_2023,
    )
    rows = (row for _, row in df_1990.iterrows())

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so the output stays
        # aligned with df_1990 regardless of which task finishes first.
        export_data = list(executor.map(process_func, rows))

    export_df = pd.DataFrame(export_data)
    export_df.to_csv(output_file_path, index=False)

In [9]:
# Output file path
# This is a relative path, so the CSV is created relative to the kernel's working directory.
output_csv_path = 'HF_1990_Sample.csv'
# Predict an HS code for each row of df_1990 and write the results to the CSV above.
match_and_export_parallel(df_1990, embeddings_2023, df_2023, output_csv_path)