In [None]:
#!pip install pandas
#!pip install numpy
#!pip install scikit-learn
#!pip install huggingface_hub
#!pip install transformers
#!pip install torch==2.0.0 torchvision==0.15.0 torchaudio==2.0.1
#!pip install accelerate>=0.26.0
#!pip install huggingface_hub

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel
from transformers import LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import torch
import huggingface_hub
import time
from sklearn.metrics import precision_score, recall_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
df_int=pd.read_csv("Data_Internal.csv")
df_ext=pd.read_csv("Data_External.csv")

In [3]:
df_int.head(10)


Unnamed: 0,NAME,OCS_NAME,LONG_NAME
0,3 Mskt DkChocMnt 1.24oz,3 Mskt DkChocMnt 1.24oz,3 Musketeers Dark Chocolate Mint (1.24oz)
1,,Costco Choc Mini 4.69lb,Costco Chocolate Mini (4.69lb)
2,Dove Dk Choc Bars 1.3oz,Dove Dk Choc Bars 1.3oz,Dove Dark Chocolate Bars (1.3oz)
3,Fishers ChocPnut 3.5oz,Fishers ChocPnut 3.5oz,Fisher's Chocolate Peanuts (3.5oz)
4,HariboGummiGoldBear2oz,HariboGummiGoldBear2oz,Haribo Gummi Gold-Bears (2oz)
5,,Heide Gummi Bears 1.4oz,Heide Gummi Bears (1.4oz)
6,Hrshy Mlk Choc 2.1oz,Hrshy Mlk Choc 2.1oz,Hersheys Milk Chocolate (2.1oz)
7,Hrshy Choc Almd 1.45oz,Hrshy Choc Almd 1.45oz,Hersheys Milk Chocolate with Almonds (1.45oz)
8,Hrshy Choc Almd 1.85oz,Hrshy Choc Almd 1.85oz,Hersheys Milk Chocolate with Almonds (1.85oz)
9,Hrshy Milk Duds 1.85oz,Hrshy Milk Duds 1.85oz,Hersheys Milk Duds (1.85oz)


In [4]:
df_ext.head(10)

Unnamed: 0,PRODUCT_NAME,UNIT_OF_MEASURE
0,5 HOUR XTRA GRAPE 1.93 OZ,1
1,B - PB & HONEY SAMMICH,1
2,B - RUDY FARMS - SAUSAGE AND BISCUIT TWIN,1
3,BANANAS - FRESH,1
4,BOBOS PB&J GRAPE 2.1 OZ,1
5,BODY ARMOR STRWBRY BANANA 16 OZ,1
6,BR ESPRESSO W/ CREAM 11 OZ,1
7,Bumble Bee Tuna Salad 3.5oz,1
8,CELSIUS ORANGE ENERGY 12 OZ,1
9,CELSIUS PEACH VIBE 12 OZ,1


In [5]:
# Standardize the text
df_int['LONG_NAME']=df_int['LONG_NAME'].str.lower().str.strip()
df_ext['PRODUCT_NAME']=df_ext['PRODUCT_NAME'].str.lower().str.strip()

In [None]:
#model_name = "gpt2"  
#model_name = "distilbert-base-uncased"
#model_name = "facebook/bart-large"

In [6]:
#local_model_path = "/root/.cache/huggingface/hub/meta-llama/Llama-2-7b"
#local_model_path = "/Users/shilpasingh/.cache/huggingface/hub/models--meta-llama--Llama-2-7b"


base_path = "/Users/shilpasingh/.cache/huggingface/hub/models--meta-llama--Llama-2-7b"
snapshot_path = os.path.join(base_path, "snapshots")

# Get the latest snapshot directory
latest_snapshot = sorted(os.listdir(snapshot_path))[-1]  # Pick the latest one
local_model_path = os.path.join(snapshot_path, latest_snapshot)

In [7]:
# Load LLaMA Model & Tokenizer from Hugging Face
model_name = "gpt2"  
tokenizer=AutoTokenizer.from_pretrained(model_name,use_fast=True)
model=AutoModel.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cpu")

#tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b")
#tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# # Load the model
# model = AutoModelForCausalLM.from_pretrained(
#     local_model_path,
#     torch_dtype=torch.float16,  # Change to torch.float32 if needed
#     device_map="auto"
# )



  Referenced from: <58F76EBD-0E69-3A37-BBFE-1B46CEF60F63> /Users/shilpasingh/Desktop/STUDY/2_TECHNICAL/CODE/CODE FILES/data_scientist_project/env3.10/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [8]:
tokenizer.pad_token = tokenizer.eos_token

In [9]:
def clean_text(text):
    text=str(text).lower().strip()
    text=re.sub(r'[^a-zA-Z0-9 ]','',text) # Remove Special characters
    text=re.sub(r'\s+',' ',text) # Remove extra spaces
    return text

In [10]:
# Apply cleaning to product names
df_ext['PRODUCT_NAME']=df_ext['PRODUCT_NAME'].apply(clean_text)
df_int['LONG_NAME']=df_int['LONG_NAME'].apply(clean_text)

In [11]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Ensure model is on CPU and perform inference
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().detach().numpy()  # Get embeddings
    return embeddings


In [12]:
# Compute embedding for internal product names
start_time=time.time()
internal_embeddins=np.vstack([get_embedding(name) for name in df_int['LONG_NAME']])
print(f"Time taken: {time.time() - start_time} seconds")

Time taken: 9161.032320976257 seconds


In [13]:
def find_best_match_bart(query, internal_embeddins, threshold=0.7):
    query_embedding = get_embedding(query)
    similarities = cosine_similarity(query_embedding, internal_embeddins).squeeze()
    best_idx = int(similarities.argmax())
    return df_int.iloc[best_idx]["LONG_NAME"] if similarities[best_idx] > threshold else None

In [14]:
# Apply matching
matches = []
true_labels = []
predicted_labels = []

for ext_product in df_ext["PRODUCT_NAME"]:
    bart_match = find_best_match_bart(ext_product, internal_embeddins)
    matches.append((ext_product, bart_match))
    true_labels.append(1 if ext_product in df_int["LONG_NAME"].values else 0)
    predicted_labels.append(1 if bart_match else 0)


In [17]:
# Convert result to Dataframe

matched_df = pd.DataFrame(matches, columns=["External_Product_Name", "Matched_Internal_Name"])
matched_df.to_csv("Matched_Results.csv", index=False)
print("Matching completed. Results saved in Matched_Results.csv.")

Matching completed. Results saved in Matched_Results.csv.


In [16]:

# Calculate Precision and Recall
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")

Precision: 0.0000, Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
