In [4]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [5]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = Chroma(
    collection_name="collection2",
    embedding_function=embeddings,
    persist_directory="./chrome_vector_db",
)

In [6]:
from pydantic import BaseModel, Field
from typing import Optional

class Attributes(BaseModel):
    length: Optional[float] = Field(default=None, description="length in inches. if the name uses different units, convert to inches")
    gauge: Optional[int] = Field(default=None, description="gauge in inches. for example, 20ga or 20g refers to 20 gauge") 

In [7]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate

model = ChatOpenAI(model="gpt-4o-mini-2024-07-18", temperature=0)
structured_llm = model.with_structured_output(Attributes)
get_attr_prompt_template = ChatPromptTemplate.from_template("Parse all given attributes from the given product name. If the attribute isn't present in the name, or you are unsure, then leave it as None. If a measurement is written like 2-1/2IN, then the measurement is 2.5 in. \nProduct Name: {product_name}")

In [14]:
product = "[CB] Corner Bead - paper faced, 10'"
get_attr_prompt = get_attr_prompt_template.invoke({'product_name': product})
parsed_attributes = structured_llm.invoke(get_attr_prompt)
attr_dict = dict(parsed_attributes)
filter = {key:{'$eq':val} for key, val in attr_dict.items() if val is not None}
results = vector_store.similarity_search(
    product,
    filter = filter,
    k=10,
)

In [15]:
filter

{'length': {'$eq': 120.0}}

In [16]:
results

[Document(id='bfa18b42-43d4-4260-9da2-f24c956a0621', metadata={'PRODUCT_NO': 534360, 'length': 120.0}, page_content='10FT PAPER-FACED BULLNOSE CORNER BEAD nan'),
 Document(id='d57cb0d4-3a34-40c2-9870-64afbbb77d88', metadata={'PRODUCT_NO': 43403, 'length': 120.0}, page_content='#1A EXPANDED CORNER BEAD 10FT nan'),
 Document(id='49a4b8ed-1140-4a5a-81ff-77152ff5bad9', metadata={'PRODUCT_NO': 43407, 'gauge': 2, 'length': 120.0}, page_content='#2A EXPANDED CORNER BEAD 10FT nan'),
 Document(id='4ae15ec2-2a67-4ea6-8033-2554fe84c6f7', metadata={'PRODUCT_NO': 219681, 'length': 120.0}, page_content='10FT VINYL DRYWALL CORNER BEAD nan'),
 Document(id='bc9663d8-6fe4-4dd4-8484-410410c1113a', metadata={'PRODUCT_NO': 202564, 'length': 120.0}, page_content='10FT GALVANIZED DRYWALL CORNER BEAD nan'),
 Document(id='9677c0a1-22b8-4f8e-b13d-6c1389be26db', metadata={'PRODUCT_NO': 437481, 'length': 120.0}, page_content='10FT VINYL BULLNOSE CORNER BEAD nan'),
 Document(id='8118098c-3abc-4fef-be0f-a3a8eea2b1f

In [17]:
import pandas as pd
labels_df = pd.read_csv('documents/Request-Response 1/labels.csv')

In [39]:
def prediction(product: str):
    get_attr_prompt = get_attr_prompt_template.invoke({'product_name': product})
    parsed_attributes = structured_llm.invoke(get_attr_prompt)
    attr_dict = dict(parsed_attributes)
    filter = {'$or': [{key:{'$eq':val}} for key, val in attr_dict.items() if val is not None]}
    results = vector_store.similarity_search(
        product,
        k=10,
    )
    res = [name.metadata['PRODUCT_NO'] for name in results]
    return res

In [40]:
guesses = []
for index, row in labels_df.iterrows():
    print(index)
    product = row['Customer Name']
    pred = prediction(product)
    guesses.append(pred)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [41]:
labels_df['Predictions'] = guesses

In [42]:
labels_df

Unnamed: 0,Customer Name,Product No,Predictions
0,"[CB] Corner Bead - paper faced, 10'",37506,"[182624, 534298, 534360, 309122, 432623, 43403..."
1,"[T35820] 3-5/8"" 20ga Track, 10'",264033,"[541212, 200144, 55385, 55352, 55353, 55354, 2..."
2,"[S35820] 3-5/8"" 20ga Stud",261033,"[541193, 541128, 420204, 200141, 423855, 43053..."
3,"[T35820SLOTTED] 3-5/8"" 20ga Slotted track, 10'",76033,"[541212, 200144, 55385, 55352, 55353, 55354, 4..."
4,"[T620] 6"" 20ga Track, 10'",264063,"[541212, 4063, 200144, 55385, 40203, 4013, 301..."
5,"[S620] 6"" 20ga Stud",261063,"[541128, 541193, 420204, 200141, 1043, 261063,..."
6,"[T620] 6"" 20ga Track, 10'",76063,"[541212, 4063, 200144, 55385, 40203, 4013, 301..."
7,"[T15820] 1-5/8"" 20ga Track, 10'",264013,"[200144, 541212, 3013, 4013, 445184, 228997, 5..."
8,"[S15820] 1-5/8"" 20ga Stud",261013,"[200141, 420204, 541128, 541193, 1013, 230867,..."
9,"[T15820] 1-5/8"" 20ga Track, 10'",266013,"[200144, 541212, 3013, 4013, 445184, 228997, 5..."


In [43]:
def calculate_topk_accuracy(df, k):
    correct = 0
    for index, row in df.iterrows():
        if row['Product No'] in row['Predictions'][:k]:
            correct += 1
    return correct / len(df)

## Top 10 accuracy

In [44]:
calculate_topk_accuracy(labels_df, 10)

0.16

## Top 5 accuracy

In [38]:
calculate_topk_accuracy(labels_df, 5)

0.08