In [1]:
import os
import openai
os.environ["OPENAI_API_KEY"] = "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"
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from sqlalchemy import create_engine, Column, Integer, String, Float, ForeignKey, Table
from sqlalchemy.orm import declarative_base, sessionmaker, relationship
import pandas as pd

def create_bulkupload_db(db_path, csv_path):
    Base = declarative_base()

    # Association tables
    part_make = Table(
        'part_make', Base.metadata,
        Column('part_id', Integer, ForeignKey('parts.id')),
        Column('make_id', Integer, ForeignKey('makes.id'))
    )

    part_models = Table(
        'part_models', Base.metadata,
        Column('part_id', Integer, ForeignKey('parts.id')),
        Column('model_id', Integer, ForeignKey('models.id'))
    )

    part_years = Table(
        'part_years', Base.metadata,
        Column('part_id', Integer, ForeignKey('parts.id')),
        Column('year_id', Integer, ForeignKey('years.id'))
    )

    class Part(Base):
        __tablename__ = 'parts'
        id = Column(Integer, primary_key=True, autoincrement=True)
        name = Column(String)
        category = Column(String)
        part_number = Column(String, nullable=False, unique=True)
        description = Column(String)
        additional_details = Column(String)
        brand = Column(String)
        quantity = Column(Integer)
        price = Column(Float)
        unit_cost = Column(Float)
        months_no_sale = Column(Integer)

        makes = relationship("Make", secondary=part_make, back_populates="parts", cascade="all, delete")
        models = relationship("Model", secondary=part_models, back_populates="parts", cascade="all, delete")
        years = relationship("Year", secondary=part_years, back_populates="parts", cascade="all, delete")

    class Make(Base):
        __tablename__ = 'makes'
        id = Column(Integer, primary_key=True, autoincrement=True)
        name = Column(String, nullable=False, unique=True)

        parts = relationship("Part", secondary=part_make, back_populates="makes")

    class Model(Base):
        __tablename__ = 'models'
        id = Column(Integer, primary_key=True, autoincrement=True)
        name = Column(String, nullable=False, unique=True)

        parts = relationship("Part", secondary=part_models, back_populates="models")

    class Year(Base):
        __tablename__ = 'years'
        id = Column(Integer, primary_key=True, autoincrement=True)
        year = Column(Integer, nullable=False)

        parts = relationship("Part", secondary=part_years, back_populates="years")

    # Initialize database
    engine = create_engine(f'sqlite:///{db_path}')
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)

    Session = sessionmaker(bind=engine)
    session = Session()

    def get_or_create(model, **kwargs):
        instance = session.query(model).filter_by(**kwargs).first()
        if instance:
            return instance
        else:
            instance = model(**kwargs)
            session.add(instance)
            session.flush()
            return instance

    def build_parts_table(parts_df):
        for _, row in parts_df.iterrows():
            new_part = Part(
                name=row.get('name'),
                part_number=row['part_number'],
                category=row.get('category'),
                description=row.get('description'),
                additional_details=row.get('additional_details'),
                quantity=row.get('quantity'),
                price=row.get('price'),
                unit_cost=row.get('cost_per_unit'),
                months_no_sale=row.get('months_no_sale')
            )
            session.add(new_part)
            session.flush()

            # Handle makes
            makes = [make.strip() for make in str(row.get('make', '')).split(',') if make.strip()]
            for make_name in makes:
                make = get_or_create(Make, name=make_name)
                new_part.makes.append(make)

            # Handle models
            models = [model.strip() for model in str(row.get('model', '')).split(',') if model.strip()]
            for model_name in models:
                model = get_or_create(Model, name=model_name)
                new_part.models.append(model)

            # Handle years
            years = str(row.get('year', '')).replace(' ', '').split(',')
            for year_range in years:
                if '-' in year_range:
                    start, end = map(int, year_range.split('-'))
                    for year in range(start, end + 1):
                        year_instance = get_or_create(Year, year=year)
                        new_part.years.append(year_instance)
                elif year_range.isdigit():
                    year_instance = get_or_create(Year, year=int(year_range))
                    new_part.years.append(year_instance)

        session.commit()

    # Load the CSV into a DataFrame
    parts_df = pd.read_csv(csv_path)
    parts_df = parts_df.drop_duplicates(subset=['part_number'])

    # Build the parts table
    build_parts_table(parts_df)

    session.close()

    return



In [3]:
db_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/notebooks/bulk_upload.db"
csv_path = "/Users/skylerwilson/Desktop/PartsWise/Data/test_data/alberta_honda_data_synthetic_v2.csv"

create_bulkupload_db(db_path, csv_path)

In [4]:
from sqlalchemy import inspect
engine = create_engine(f'sqlite:///{db_path}')
inspector = inspect(engine)
print(inspector.get_table_names())

['makes', 'model_year', 'models', 'part_make', 'part_model', 'part_models', 'part_year', 'part_years', 'parts', 'years']


In [5]:
from sqlalchemy import create_engine
from llama_index.core.objects import SQLTableNodeMapping, ObjectIndex, SQLTableSchema
from llama_index.core import VectorStoreIndex, SQLDatabase
from llama_index.core.retrievers import SQLRetriever
from llama_index.llms.openai import OpenAI

# Database connection
db_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/notebooks/bulk_upload.db"
engine = create_engine(f'sqlite:///{db_path}')

# Create SQLDatabase object
sql_database = SQLDatabase(engine)
llm = OpenAI(model="gpt-4o-mini-2024-07-18")

# Create table node mapping and schema objects
table_node_mapping = SQLTableNodeMapping(sql_database)
table_schema_objs = [
    SQLTableSchema(table_name="parts", context_str="Table containing automotive parts data including id, name, category, part_number, description, additional_details, brand, quantity, price, unit_cost, months_no_sale."),
    SQLTableSchema(table_name="makes", context_str="Table containing car makes with id and name."),
    SQLTableSchema(table_name="models", context_str="Table containing car models with id and name."),
    SQLTableSchema(table_name="years", context_str="Table containing years with id and year."),
    SQLTableSchema(table_name="part_make", context_str="Association table between parts and makes with part_id and make_id."),
    SQLTableSchema(table_name="part_models", context_str="Association table between parts and models with part_id and model_id."),
    SQLTableSchema(table_name="part_years", context_str="Association table between parts and years with part_id and year_id."),
]

# Create object index for table schemas
obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
)
obj_retriever = obj_index.as_retriever(similarity_top_k=7)

In [6]:
from typing import List

def get_table_context_str(table_schema_objs: List[SQLTableSchema]):
    """Get table context string."""
    context_strs = []
    for table_schema_obj in table_schema_objs:
        table_info = sql_database.get_single_table_info(
            table_schema_obj.table_name
        )
        if table_schema_obj.context_str:
            table_opt_context = " The table description is: "
            table_opt_context += table_schema_obj.context_str
            table_info += table_opt_context

        context_strs.append(table_info)
    return "\n\n".join(context_strs)

In [13]:
from llama_index.core import PromptTemplate
from llama_index.core.llms import ChatResponse


def parse_response_to_sql(chat_response: ChatResponse) -> str:
    """Parse response to SQL."""
    response = chat_response.message.content
    sql_query_start = response.find("SQLQuery:")
    if sql_query_start != -1:
        response = response[sql_query_start:]
        # TODO: move to removeprefix after Python 3.9+
        if response.startswith("SQLQuery:"):
            response = response[len("SQLQuery:") :]
    sql_result_start = response.find("SQLResult:")
    if sql_result_start != -1:
        response = response[:sql_result_start]
    return response.strip().strip("```").strip()


text2sql_prompt = PromptTemplate("""
    Given the following SQL tables:
    {schema}

    Generate a SQL query to answer the following question:
    {query_str}

    SQL Query Guidelines:
    1. ALWAYS include these output columns in this order: name, category, part_number, description, additional_details, quantity, price, year, make, model.
    2. Use LEFT JOINs to ensure all parts are included, even if they don't have associated makes, models, or years.
    3. Use GROUP_CONCAT with DISTINCT when aggregating model, make, and year values.
    4. JOINs should follow this pattern: parts -> part_make -> makes, parts -> part_models -> models, parts -> part_years -> years.
    5. Use direct queries and only use JOINs when needed.
    6. Prefer filtering directly on 'parts' when possible.
    7. Use literal values; no placeholders.
    8. Use single quotes (') for string literals.
    9. For percentage calculations: decrease by X% = price * (1 - X/100), increase by X% = price * (1 + X/100).
    10. Use LOWER() + LIKE '%example%' for case-insensitive search.
    11. Use WHERE + AND/OR for precise filtering.
    12. Include all makes, models, and years a part fits when querying parts.
    13. When querying for parts that fit a specific model (e.g., Pilot), return ALL parts that fit the model, INCLUDING those that also fit other models. The specified model should be one of the compatible models, not necessarily the only model.
    14. Use EXISTS or IN subqueries to check if a part fits a specific model without excluding parts that fit multiple models.
    15. ONLY use tables: 'parts', 'makes', 'models', 'years', 'part_years', 'part_make', 'part_models'.
    16. ONLY use column names specified in the schema. DO NOT create new or adjust column names.
    17. When filtering for a specific model, use a combination of EXISTS subquery and HAVING clause to ensure the model is included but not exclusive.

    SQL Query:
    """
)
print(text2sql_prompt.template)



    Given the following SQL tables:
    {schema}

    Generate a SQL query to answer the following question:
    {query_str}

    SQL Query Guidelines:
    1. ALWAYS include these output columns in this order: name, category, part_number, description, additional_details, quantity, price, year, make, model.
    2. Use LEFT JOINs to ensure all parts are included, even if they don't have associated makes, models, or years.
    3. Use GROUP_CONCAT with DISTINCT when aggregating model, make, and year values.
    4. JOINs should follow this pattern: parts -> part_make -> makes, parts -> part_models -> models, parts -> part_years -> years.
    5. Use direct queries and only use JOINs when needed.
    6. Prefer filtering directly on 'parts' when possible.
    7. Use literal values; no placeholders.
    8. Use single quotes (') for string literals.
    9. For percentage calculations: decrease by X% = price * (1 - X/100), increase by X% = price * (1 + X/100).
    10. Use LOWER() + LIKE '%exampl

In [14]:
from llama_index.core.workflow import (
    Workflow,
    StartEvent,
    StopEvent,
    step,
    Context,
    Event,
)

class TableRetrieveEvent(Event):
    """Result of running table retrieval."""
    table_context_str: str
    query: str

class TextToSQLEvent(Event):
    """Text-to-SQL event."""

    sql: str
    query: str

class TextToSQLWorkflow(Workflow):
    def __init__(
        self,
        obj_retriever,
        text2sql_prompt,
        llm,
        *args,
        **kwargs
    ) -> None:
        super().__init__(*args, **kwargs)
        self.obj_retriever = obj_retriever
        self.text2sql_prompt = text2sql_prompt
        self.llm = llm
        self.parse_response_to_sql = parse_response_to_sql

    @step
    def retrieve_tables(
        self, ctx: Context, ev: StartEvent
    ) -> TableRetrieveEvent:
        """Retrieve tables."""
        table_schema_objs = self.obj_retriever.retrieve(ev.query)
        table_context_str = get_table_context_str(table_schema_objs)
        return TableRetrieveEvent(
            table_context_str=table_context_str, query=ev.query
        )


    @step
    def generate_sql(
        self, ctx: Context, ev: TableRetrieveEvent
    ) -> TextToSQLEvent:
        """Generate SQL statement."""
        fmt_messages = self.text2sql_prompt.format_messages(
            query_str=ev.query, schema=ev.table_context_str
        )
        chat_response = self.llm.chat(fmt_messages)
        sql = parse_response_to_sql(chat_response)
        return TextToSQLEvent(sql=sql, query=ev.query)
    
    @step
    def return_sql(self, ctx: Context, ev: TextToSQLEvent) -> StopEvent:
        """Return the SQL statement"""
        return StopEvent(result=ev.sql)

# Workflow initialization and execution
workflow = TextToSQLWorkflow(
    obj_retriever,
    text2sql_prompt,
    llm,
    verbose=True,
)

user_query = "Identify all Pilot parts that have a months no sale greater than or equal to 12 and a price greater than 50 dollars and reduce the price by 85 percent"

# Ensure the workflow run is awaited properly
response = await workflow.run(query=user_query)
print(str(response))


    


Running step retrieve_tables
Step retrieve_tables produced event TableRetrieveEvent
Running step generate_sql
Step generate_sql produced event TextToSQLEvent
Running step return_sql
Step return_sql produced event StopEvent
Here is the SQL query that meets all the specified guidelines:

```sql
SELECT 
    p.name, 
    p.category, 
    p.part_number, 
    p.description, 
    p.additional_details, 
    p.quantity, 
    p.price * (1 - 85/100) AS price, 
    GROUP_CONCAT(DISTINCT y.year) AS year, 
    GROUP_CONCAT(DISTINCT m.name) AS make, 
    GROUP_CONCAT(DISTINCT mo.name) AS model
FROM 
    parts p
LEFT JOIN 
    part_make pm ON p.id = pm.part_id
LEFT JOIN 
    makes m ON pm.make_id = m.id
LEFT JOIN 
    part_models pm2 ON p.id = pm2.part_id
LEFT JOIN 
    models mo ON pm2.model_id = mo.id
LEFT JOIN 
    part_years py ON p.id = py.part_id
LEFT JOIN 
    years y ON py.year_id = y.id
WHERE 
    p.months_no_sale >= 12 
    AND p.price > 50 
    AND EXISTS (
        SELECT 1 
        FROM pa

In [15]:
import re
import re
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

def extract_sql_from_response(response):
    # If the response is already just SQL, return it as is
    if response.strip().upper().startswith("SELECT"):
        return response.strip().rstrip(';')
    
    # Otherwise, try to find SQL within the response
    match = re.search(r'(?:sql|SQL)?\s*(SELECT[\s\S]+?)\s*(?:;|$)(?:\n|$)', response, re.IGNORECASE)
    if match:
        return match.group(1).strip().rstrip(';')
    else:
        raise ValueError("No SQL query found in the response")
    
sql_query = extract_sql_from_response(str(response))

def run_sql_query(sql_query, engine):
    try:
        with engine.connect() as connection:
            result = connection.execute(text(sql_query))
            return result.fetchall()
    except SQLAlchemyError as e:
        print(f"An error occurred: {e}")
        return None
    
engine = create_engine(f'sqlite:///{db_path}')
results = run_sql_query(sql_query, engine)
result_df = pd.DataFrame(results)
result_df = result_df.rename(columns={"discounted_price":"price"})
result_df


Unnamed: 0,name,category,part_number,description,additional_details,quantity,price,year,make,model
0,BU SENSOR R561P - 08V67-TK8-1F0K,Honda,08V67-TK8-1F0K,bu sensor r561p,,1,232.80,20212022,Honda,"Insight,Pilot,Clarity"
1,BASE FR GRILLE - 71121-TG7-A11,Honda,71121-TG7-A11,base fr grille,,1,126.83,2020,Honda,"Passport,Odyssey,Pilot,HR-V,Accord"
2,COVER ASSY PARKI - 39681-TX4-A01ZJ,Honda,39681-TX4-A01ZJ,cover assy parki,,3,89.28,2011,Honda,"Passport,Pilot,Element,Ridgeline"
3,BLADE WSHLD WIPER - 76620-S9A-306,Honda,76620-S9A-306,blade wshld wiper,,2,68.52,2013,Honda,"Pilot,Ridgeline"
4,KNUCKLE COMP R RR - 52210-TZ6-A00,Honda,52210-TZ6-A00,knuckle comp r rr,,1,193.95,2010,Honda,"Fit,Pilot,Element"
...,...,...,...,...,...,...,...,...,...,...
83,ALTERNATOR (CSJ24 - 31100-RV0-305,Honda,31100-RV0-305,alternator (csj24,,2,1236.18,2019202020212015201620172018,Honda,Pilot
84,HANDLE ASSY R SL - 72640-TK8-A02,Honda,72640-TK8-A02,handle assy r sl,,1,63.41,2012,Honda,"Pilot,HR-V,Ridgeline"
85,CAP ASSY WHEEL CT - 44732-TR3-A01,Honda,44732-TR3-A01,cap assy wheel ct,,7,121.80,2012,Honda,"Fit,Pilot,Element,Ridgeline"
86,ODY LOCK - 903,Honda,903,ody lock,,4,152.00,2020,Honda,"Insight,Pilot,Accord,Crosstour"


In [10]:
test_inputs = [
    # Retained original queries for baseline testing
    ["Get all pilot parts valued over $100 and reduce their price by 50%",
     "Get all Honda pilot parts and reduce their price by 30%"],
    
    # Complex queries with multiple conditions and operations
    ["Find all Honda and Toyota parts compatible with models from 2018-2023, priced between $200 and $1000, that haven't sold in the last 6 months, then reduce their price by 15% and flag them for a special promotion"],
    ["Identify the top 5 most expensive parts for each car make, considering only parts compatible with models from the last 8 years, then calculate the average price of these top parts across all makes"],
    
    # Queries testing edge cases and specific functionalities
    ["Get all parts that fit exactly 3 different models, cost less than $75, have been in stock for more than 18 months, then increase their price by 5% for each model they fit"],
    ["Find parts compatible with every model year of Ford F-150 from 2010 to 2024, cost more than $500, and reduce their price by 2% for each month they haven't been sold, up to a maximum of 25% reduction"],
    
    # Queries involving complex calculations and comparisons
    ["Identify parts that are priced at least 50% above the average price for their category (based on the first word of the description) and haven't been sold in the last year, then reduce their prices to the category average"],
    ["Find all parts that fit both domestic (Ford, Chevrolet, Dodge) and foreign (Honda, Toyota, Nissan) vehicles, are priced above the median for their respective categories, and create a 'discount_price' column that's 80% of the original price for parts not sold in 9+ months"],
    
    # Queries testing the system's ability to handle unusual requests
    ["Get all parts where the part number contains both letters and numbers, the price is a prime number, and the description includes the word 'sensor', then increase the price by 7.5%"],
    ["Identify parts that are compatible with at least one model from every year between 2000 and 2024, cost over $150, and create a new 'popularity_score' column based on the number of compatible models and inverse of months_no_sale"],
    
    # Queries involving multiple joins and subqueries
    ["For each car make, find the model with the most compatible parts, then list the top 10 most expensive parts for that model, including parts that fit multiple models"],
    ["Identify parts that fit more than 5 different make-model combinations, then for those parts, calculate the price difference compared to the average price of parts fitting only one make-model combination"],
    
    # Queries testing error handling and edge cases
    ["Get all parts for Ferrari models made after 2025"],  # Testing for non-existent data
    ["List all parts with a negative price or negative months_no_sale"],  # Testing for data integrity
    ["Get all parts and divide their price by zero"],  # Testing for division by zero error handling
    
    # Queries with potential SQL injection attempts
    ["Get all Honda parts'; DROP TABLE parts; --"],
    ["Get all parts where part_number = '1234' OR '1'='1'"]
]

In [11]:
# Example querys
#user_query = "Get all my honda pilot parts for the years 2020-2024 that have a months no sale greater than or equal to 12 and are priced higher than 50 dollars and decrease their price by 80%"


In [16]:
import polars as pl
from openai import OpenAI
import os

def load_data(file_path):
    return pd.read_csv(file_path)

def get_part_info(df, part_number):
    part = df[df['part_number'] == part_number]
    if part.empty:
        return None
    return part.iloc[0].to_dict()

def generate_description(part_info, user_context, client):
    if part_info is None:
        return "Part not found in the database."

    prompt = f"""
    User Context: {user_context}
    Parts Information: {part_info}
    Generate a concise, technical description for the part using the following format:
    Part Number: [Insert part number]
    Name: [Insert part name]
    Application: [Insert make, model, and year range]
    Condition: [New/Used/Remanufactured]
    Technical Specs: [List key specifications]
    Fitment Details: [Provide specific fitment information]
    Notable Features: [Mention any unique features]
    
    Use industry-standard terminology, be specific about compatibility, and highlight key technical details. Limit to 100 words.

    ALWAYS leave a note for user to check correctness of fitment info
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18", 
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

def process_parts(file_path, user_context):
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  
    df = load_data(file_path)
    
    results = []
    for _, row in df.iterrows():
        part_info = row.to_dict()
        description = generate_description(part_info, user_context, client)
        part_info['description'] = description
        results.append(part_info)
    
    return pl.DataFrame(results)

def clean_description(text):
    return re.sub(r'\*\*([^*\n]+)\*\*', r'\1', text)

def post_process_parts_description(df):
    df = df.with_columns(
        pl.col('description').map_elements(clean_description, return_dtype=pl.Utf8),
    )
    return df


user_context = "Genuine OEM Honda parts that are brand new and in the box with original packaging. All parts are in new condition."
file_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/notebooks/test_results.csv"

result = process_parts(file_path, user_context)
result_df = post_process_parts_description(result)

KeyboardInterrupt: 

In [None]:
filename="/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/notebooks/test_results_w_desccription.csv"
result_df.write_csv(filename)