In [1]:
import logging
import mimetypes
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial

import numpy as np
import openai
import pandas as pd
import torch
from fuzzywuzzy import fuzz
from openai import OpenAI
from sentence_transformers import SentenceTransformer, util
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Silence the tokenizers fork/parallelism warning; set before the model is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model used to embed descriptions for similarity search.
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Read the API key from the environment instead of hardcoding it in the
# notebook. The previous version assigned an empty string to
# OPENAI_API_KEY, which clobbered any key that was already exported.
api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before starting the kernel."
    )

# `api_key` is kept as a module-level name because the raw HTTP request in a
# later cell builds its Authorization header from it.
client = OpenAI(api_key=api_key)

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
import base64
import requests


# Function to encode the image
def encode_image(image_path):
    """Read the file at ``image_path`` in binary mode and return its base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str`` via UTF-8.
    """
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Path of the product image to describe.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
image_path = "/home/samirk08/UROP_SPRING_2024/UserInput/brown husked rice.png"

# Base64-encode the image so it can be embedded in the JSON request body.
base64_image = encode_image(image_path)

# Derive the MIME type from the file extension so the data URL matches the
# actual file format (a .png was previously labelled image/jpeg). Falls back
# to the old image/jpeg label when the extension is not recognised.
image_mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}

# Single user message carrying the instruction text plus the inline image.
# NOTE(review): confirm "gpt-4-vision-preview" is still an available model.
payload = {
    "model": "gpt-4-vision-preview",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please provide a concise description of the primary item in this image, focusing on its identifiable and classifiable features relevant for customs and tariff purposes. Use no more than 10 words.",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{image_mime_type};base64,{base64_image}",
                    },
                },
            ],
        }
    ],
    "max_tokens": 300,
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (bad key, unknown model, rate limit)
# directly instead of as a KeyError on 'choices' further down.
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=payload,
    timeout=60,
)
response.raise_for_status()
response_data = response.json()

# Keep only the generated description; fail with the full payload if the
# response does not contain any choices.
choices = response_data.get("choices") or []
if not choices:
    raise RuntimeError(f"OpenAI response contained no choices: {response_data}")
user_input = choices[0]['message']['content']
print(user_input)


Brown rice grains, uncooked, agricultural produce.


In [4]:

# Load the 2023 tariff table; the code below reads its 'brief_description'
# column here and its 'hts8' column when looking up a match.
df_2023 = pd.read_excel("/home/samirk08/UROP_SPRING_2024/UROP IAP 2024/Original Databases/tariff database_202305.xlsx")
brief_descriptions = df_2023['brief_description'].tolist()

# Load the embeddings that were saved to disk earlier (nothing is computed
# here). map_location lets a file saved from a CUDA device be loaded on a
# CPU-only machine and places the result on the active device.
embeddings_2023 = torch.load(
    "/home/samirk08/UROP_SPRING_2024/UserInput/embeddings.pt",
    map_location=device,
)

# Matches are looked up by row position, so the embeddings must line up
# one-to-one with the rows of the table.
if embeddings_2023.shape[0] != len(df_2023):
    raise ValueError(
        f"Embedding count ({embeddings_2023.shape[0]}) does not match "
        f"the number of tariff rows ({len(df_2023)})."
    )

In [5]:
@retry(
    # Only retry failures that can succeed on a later attempt. Retrying every
    # exception made authentication and bad-request errors stall for up to
    # 50 attempts before surfacing.
    retry=retry_if_exception_type((
        openai.RateLimitError,
        openai.APIConnectionError,
        openai.APITimeoutError,
        openai.InternalServerError,
    )),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    stop=stop_after_attempt(50),
)
def ask_gpt(prompt, system_prompt, model_name="gpt-4"):
    """Send a system + user message pair to the chat completions API.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Text sent as the system message.
        model_name: Chat model to call. Defaults to "gpt-4".

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the API returns no content.

    Raises:
        openai.OpenAIError: Non-transient API errors propagate immediately.
        tenacity.RetryError: A transient error persisted through all attempts.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=300,
        temperature=0.0,
    )
    # content can be None; guard so .strip() does not raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

In [6]:
def calculate_similarity(description, embeddings_2023, df_2023):
    """Find the tariff row whose embedding scores highest against ``description``.

    Args:
        description: Text to embed with the module-level ``model``.
        embeddings_2023: Tensor of reference embeddings; assumed to be
            row-aligned with ``df_2023`` — TODO confirm.
        df_2023: DataFrame providing the 'hts8' and 'brief_description' columns.

    Returns:
        A tuple ``(hts8, score, brief_description)`` for the best-scoring row,
        where ``score`` is a Python float.
    """
    query_vec = model.encode(description, convert_to_tensor=True).to(device)
    scores = util.pytorch_cos_sim(query_vec, embeddings_2023)

    # Resolve the winning column index once and reuse it for every lookup.
    best_idx = torch.argmax(scores, dim=1).item()
    best_row = df_2023.iloc[best_idx]

    return best_row['hts8'], scores[0, best_idx].item(), best_row['brief_description']


In [7]:
def process_and_compare(user_input):
    """Match ``user_input`` to a tariff row, with and without GPT enhancement.

    The input is matched twice: once after being rewritten by ``ask_gpt`` and
    once as given. The match with the strictly higher similarity score wins;
    a tie goes to the un-enhanced ('HF') match.

    Args:
        user_input: Product description text.

    Returns:
        A tuple ``(hs_code, similarity_score, method_used, description)``
        where ``method_used`` is 'GPT' or 'HF'.
    """
    system_prompt = "Enhance this product description to be more detailed and specific for tariff classification purposes:"
    enhanced_text = ask_gpt(user_input, system_prompt)

    # Candidate 1: the GPT-enhanced description.
    gpt_code, gpt_score, gpt_desc = calculate_similarity(enhanced_text, embeddings_2023, df_2023)

    # Candidate 2: the raw user input.
    hf_code, hf_score, hf_desc = calculate_similarity(user_input, embeddings_2023, df_2023)

    if gpt_score > hf_score:
        return gpt_code, gpt_score, 'GPT', gpt_desc
    return hf_code, hf_score, 'HF', hf_desc


In [8]:
# Classify the description produced by the vision cell above. To classify
# typed text instead, assign user_input = input("Enter a tariff description: ")
# before this call.
(
    chosen_hs_code,
    final_similarity_score,
    method_used,
    chosen_description,
) = process_and_compare(user_input)

# Report the winning method, the matched code with its score, and the matched text.
print(
    f"Method Used: {method_used}",
    f"Matched HS Code: {chosen_hs_code}, Similarity Score: {final_similarity_score}",
    f"Matched Description: {chosen_description}",
    sep="\n",
)


Method Used: HF
Matched HS Code: 10062040, Similarity Score: 0.6716567277908325
Matched Description: Husked (brown) rice, other than Basmati
