In [1]:
import getpass
import os

# Prompt for the OpenAI key only when the environment does not already hold
# one, so re-running this cell (or Run All with the key exported in the shell)
# neither re-prompts nor overwrites a working key. getpass reads the value
# without echoing it, which keeps the secret out of the saved notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Embedding client used by the vector store to embed stored documents.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chroma collection "collection2", configured with a local persist directory.
vector_store = Chroma(
    persist_directory="./chrome_vector_db",
    embedding_function=embeddings,
    collection_name="collection2",
)

In [3]:
from pydantic import BaseModel, Field
from typing import Optional

class Attributes(BaseModel):
    # Structured-output schema: the attributes the LLM is asked to extract from
    # a product name. Both fields default to None so the model can leave an
    # attribute out when the name does not state it.
    #
    # NOTE: documented with comments rather than a class docstring on purpose.
    # Pydantic copies a model docstring into the generated schema, so adding
    # one would change the schema description derived from this class.

    # Length expressed in inches (the description asks the model to convert).
    length: Optional[float] = Field(default=None, description="length in inches. if the name uses different units, convert to inches")
    # Gauge is a unitless rating number, not a measurement in inches; the
    # description says so explicitly so the model does not try to convert it.
    gauge: Optional[int] = Field(default=None, description="gauge number (a unitless thickness rating, not a measurement in inches). for example, 20ga or 20g refers to 20 gauge")

In [4]:
import pandas as pd

# Load documents/ProductDB_clean.csv (path is relative to the working directory).
catalog_df = pd.read_csv(filepath_or_buffer="documents/ProductDB_clean.csv")

In [5]:
from langchain_openai import ChatOpenAI

# Chat model pinned to a dated snapshot, with temperature fixed at 0.
model = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini-2024-07-18",
)

# Same model, configured to answer using the Attributes schema.
structured_llm = model.with_structured_output(Attributes)

In [6]:
from langchain_core.prompts.chat import ChatPromptTemplate
# Extraction prompt; {product_name} is filled in per product at invoke time.
# The literal is split across lines only for readability — the pieces
# concatenate to exactly the same template text.
get_attr_prompt_template = ChatPromptTemplate.from_template(
    "Parse all given attributes from the given product name. "
    "If the attribute isn't present in the name, or you are unsure, then leave it as None. "
    "If a measurement is written like 2-1/2IN, then the measurement is 2.5 in. \n"
    "Product Name: {product_name}"
)

In [7]:
# Load documents/productdb_subset.csv; the ingestion loop iterates over these rows.
productdb_df = pd.read_csv(filepath_or_buffer='documents/productdb_subset.csv')

In [10]:
from langchain_core.documents import Document
# Ingest every product row: build the text to embed, ask the LLM for structured
# attributes, and store the text plus attributes in the vector store.
total_products = len(productdb_df)
# enumerate() supplies the 1-based progress counter; `index` is the DataFrame
# label and is not guaranteed to be a 0-based integer.
for position, (index, row) in enumerate(productdb_df.iterrows(), start=1):
    product_name = str(row["NAME"])
    # pandas reads an empty ALIAS cell as NaN, and str(NaN) would append the
    # literal text "nan" to the embedded name — so skip a missing alias.
    alias = row["ALIAS"]
    name = f"{product_name} {alias}" if pd.notna(alias) else product_name

    get_attr_prompt = get_attr_prompt_template.invoke({'product_name': name})
    parsed_attributes = structured_llm.invoke(get_attr_prompt)
    attr_dict = dict(parsed_attributes)

    # Keep only the attributes that were actually extracted (drop None values).
    metadata = {key: val for key, val in attr_dict.items() if val is not None}
    metadata['PRODUCT_NO'] = row["PRODUCT_NO"]
    document = Document(page_content=name, metadata=metadata)

    # Deterministic id so re-running this cell updates the existing entry
    # instead of adding a duplicate under a fresh random id.
    # NOTE(review): relies on the store upserting on a matching id and on
    # (PRODUCT_NO, name) identifying a product — confirm both.
    document_id = f"{row['PRODUCT_NO']}:{name}"
    vector_store.add_documents([document], ids=[document_id])
    print(f"Added document {position}/{total_products} with name {name} and attributes length {attr_dict['length']} and gauge {attr_dict['gauge']}")

Added document 1/22 with name STRUCTURAL STUD 3-5/8IN 18G 1-5/8IN FL 362S162-43 (18G) and attributes length 1.625 and gauge 18
Added document 2/22 with name STRUCTURAL STUD 6IN 18G 1-5/8IN FL 600S162-43 (18G) and attributes length 6.0 and gauge 18
Added document 3/22 with name ProSTUD20 1-5/8IN DRYWALL STUD 162PDS125-18 (20EQ) and attributes length 1.625 and gauge 20
Added document 4/22 with name ProSTUD20 3-5/8IN DRYWALL STUD 362PDS125-18 (20EQ) and attributes length 3.625 and gauge 20
Added document 5/22 with name ProSTUD20 6IN DRYWALL STUD 600PDS125-18 (20EQ) and attributes length 6.0 and gauge 20
Added document 6/22 with name ProTRAK20 1-5/8IN DW TRACK 1-1/4IN LEG 162PDT125-18 (20EQ) and attributes length 1.625 and gauge 20
Added document 7/22 with name ProTRAK20 3-5/8IN DW TRACK 1-1/4IN LEG 362PDT125-18 (20EQ) and attributes length 3.625 and gauge 20
Added document 8/22 with name ProTRAK20 6IN DRYWALL TRACK 1-1/4IN LEG 600PDT125-18 (20EQ) and attributes length 6.0 and gauge 20
Add