In [13]:
from sqlalchemy import inspect
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, Float
from sqlalchemy import text
import pandas as pd
import os
import openai
from sqlalchemy.orm import sessionmaker, relationship, declarative_base

In [14]:
os.environ["OPENAI_API_KEY"] = "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"
openai.api_key = os.environ["OPENAI_API_KEY"]

In [15]:
Base = declarative_base()

class Part(Base):
    __tablename__ = 'parts'
    # name = Column(String)
    # category = Column(String)
    part_number = Column(String, primary_key=True, unique=True)
    description = Column(String)
    # additional_details = Column(String)
    quantity = Column(Integer)
    price = Column(Float)
    make = Column(String)
    # allowed_categories = Column(String)

class Model(Base):
    __tablename__ = 'models'
    id = Column(Integer, primary_key=True, autoincrement=True)
    part_number = Column(String, ForeignKey('parts.part_number'))
    model_name = Column(String, unique=True)

class Year(Base):
    __tablename__ = 'years'
    id = Column(Integer, primary_key=True, autoincrement=True)
    part_number = Column(String, ForeignKey('parts.part_number'))
    year = Column(Integer, unique=True)

# model year junction table
class ModelYear(Base):
    __tablename__ = 'model_year'
    id = Column(Integer, primary_key=True, autoincrement=True)
    part_number = Column(Integer, ForeignKey('parts.part_number'))
    model_id = Column(Integer, ForeignKey('models.id'), nullable=True)
    year_id = Column(Integer, ForeignKey('years.id'), nullable=True)


In [16]:
db_path = r'C:\Users\vivia\co-pilot-v1\Notebooks\bulk_upload.db'
engine = create_engine(f'sqlite:///{db_path}')
csv_path = r"C:\Users\vivia\co-pilot-v1\Notebooks\alberta_honda_data_synthetic.csv"

Base.metadata.drop_all(engine)

Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
session = Session()

parts_df = pd.read_csv(csv_path)
parts_df = parts_df.drop_duplicates(subset=['part_number'])

parts_data = parts_df.to_dict('records')

In [17]:
# making parts table
for part in parts_data:
    new_part = Part(
        part_number = part['part_number'],
        description = part.get('description'),
        quantity = part.get('quantity'),
        price = part.get('price'),
        make = part.get('make')    
    )
    session.add(new_part)
    session.flush()

session.commit()

In [18]:

import numpy as np

def get_unique_elements(col):
    col_data = np.array(parts_df[col])
    unique_col = set()

    for entry in col_data:
        if "," in entry:
            separate_entries = entry.split(", ")
            for element in separate_entries:
                unique_col.add(element)
        else:
            unique_col.add(entry)
    
    return unique_col

# getting unique model names
unique_models = get_unique_elements('model')

# insert models into the database
for model in unique_models:
    new_model = Model(model_name=model)
    session.add(new_model)
session.commit()

In [19]:
# getting unique years
unique_years = get_unique_elements('year')

# insert years into the database
for year in unique_years:
    new_year = Year(year=year)
    session.add(new_year)
session.commit()

In [20]:
# insert all models and years into model_year database
for row in parts_data:
    part_number = row['part_number']
    model_name = row.get('model')
    year_value = row.get('year')

    # splitting to get all possible models and years
    if "," in model_name:
        all_models = model_name.split(", ")
    else:
        all_models = [model_name]
    
    if "," in year_value:
        all_years = year_value.split(", ")
    else:
        all_years = [year_value]
    
    # get all possible combinations of models and years
    unique_model_years = []
    for model in all_models:
        for year in all_years:
            unique_model_years.append([model, year])
    
    # insert all models and years into database for a given part
    for model_year in unique_model_years:
        model = model_year[0]
        year = model_year[1]

        if model:
            model_id = session.query(Model.id).filter_by(model_name=model).first()
            model_id = model_id[0] if model_id else None
        else:
            model_id = None
        
        if year:
            year_id = session.query(Year.id).filter_by(year=year).first()
            year_id = year_id[0] if year_id else None
        else:
            year_id = None

        new_model_year = ModelYear(part_number=part_number, model_id=model_id, year_id=year_id)
        session.add(new_model_year)
    
    session.commit()
    session.close()

In [21]:
from sqlalchemy import create_engine, inspect

engine = create_engine(f'sqlite:///{db_path}')

# Create an inspector
inspector = inspect(engine)

# Get the list of table names
tables = inspector.get_table_names()

# Print the structure of each table
for table in tables:
    print(f"\nTable: {table}")
    columns = inspector.get_columns(table)
    for column in columns:
        print(f"  Column: {column['name']} - {column['type']}")

# Additionally, print foreign key information
for table in tables:
    print(f"\nForeign Keys for Table: {table}")
    foreign_keys = inspector.get_foreign_keys(table)
    for fk in foreign_keys:
        print(f"  Foreign Key: {fk['constrained_columns']} -> {fk['referred_table']}.{fk['referred_columns']}")



Table: model_year
  Column: id - INTEGER
  Column: part_number - INTEGER
  Column: model_id - INTEGER
  Column: year_id - INTEGER

Table: models
  Column: id - INTEGER
  Column: part_number - VARCHAR
  Column: model_name - VARCHAR

Table: parts
  Column: part_number - VARCHAR
  Column: description - VARCHAR
  Column: quantity - INTEGER
  Column: price - FLOAT
  Column: make - VARCHAR

Table: years
  Column: id - INTEGER
  Column: part_number - VARCHAR
  Column: year - INTEGER

Foreign Keys for Table: model_year
  Foreign Key: ['part_number'] -> parts.['part_number']
  Foreign Key: ['model_id'] -> models.['id']
  Foreign Key: ['year_id'] -> years.['id']

Foreign Keys for Table: models
  Foreign Key: ['part_number'] -> parts.['part_number']

Foreign Keys for Table: parts

Foreign Keys for Table: years
  Foreign Key: ['part_number'] -> parts.['part_number']


In [22]:

with engine.connect() as connection:
    query = text("""SELECT p.part_number, m.model_name, p.price, p.quantity, p.make, p.description, y.year
                    FROM model_year my
                    JOIN models m ON my.model_id=m.id
                    JOIN parts p ON my.part_number=p.part_number
                    JOIN years y ON my.year_id=y.id
                 """)
    df = connection.execute(query)
    
df = pd.DataFrame(df.fetchall(), columns=["part_number", "model_name", "price", "quantity", "make", "description", "year"])
df

Unnamed: 0,part_number,model_name,price,quantity,make,description,year
0,04301-RNA-307,HR-V,68.9920,1,Honda,P/S BELT,2020
1,04301-RNA-307,CR-Z,68.9920,1,Honda,P/S BELT,2020
2,04711-3A0-A00ZZ,Pilot,298.9375,2,Honda,FACE FR BUMPER,2016
3,04711-3A0-A00ZZ,Pilot,298.9375,2,Honda,FACE FR BUMPER,2013
4,04711-3A0-A00ZZ,Pilot,298.9375,2,Honda,FACE FR BUMPER,2018
...,...,...,...,...,...,...,...
5739,TUMBLER2,CR-V,28.6580,0,Honda,WHITE TUMBLER,2012
5740,UNDERCOAT,Clarity,61.6000,37,Honda,PROTECTION,2016
5741,UNDERCOAT,Fit,61.6000,37,Honda,PROTECTION,2016
5742,UNDERCOAT,Pilot,61.6000,37,Honda,PROTECTION,2016


In [12]:
from llama_index.core import SQLDatabase, VectorStoreIndex
from llama_index.core.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)
from llama_index.core.indices.struct_store import SQLTableRetrieverQueryEngine
from llama_index.llms.openai import OpenAI

def setup_nlsql_query_engine():
    def initialize_table_objects():
        sql_database = SQLDatabase(engine, sample_rows_in_table_info=2, include_tables=['parts', 'model_year', 'models', 'years'])

        model_year_context = """The 'model_year' table provides model and year information for a given part number when combined with 'models' table and 'years' table. 
                                Includes columns 'id', 'part_number', 'model_id', and 'year_id'. """
        models_context = "The 'models' table provides the model name of a given model id. The model of a car is the specific product, not to be confused with the make or brand of the car. Combine with 'model_year' to get the corresponding model of a specific model id. Includes columns 'id', 'part_number', and 'model_name'."
        parts_context = """The 'parts' table provides detailed inventory data for individual parts. Use part-specific queries. Includes columns 'part_number', 'description', 'quantity', 'price', and 'make'.
                            The 'make' of the car is the brand, not to be confused with the model of the car. (e.g. Honda is the make, CR-V is the model).
                            Combine with 'model_year', 'models', 'years' to get all information on a given part number."""
        years_context = "The 'years' table provides the year of a given year id. Combine with 'model_year' to get the corresponding year of a specific year id. Includes columns 'id', 'part_number', and 'year'."

        context_strs = [model_year_context, models_context, parts_context, years_context]
        table_context_str = "The Table description is: "
        table_context_str += "\n\n The Table description is: ".join(context_strs)

        table_node_mapping = SQLTableNodeMapping(sql_database)
        table_schema_objs = [
            SQLTableSchema(table_name='model_year', context_str=model_year_context),
            SQLTableSchema(table_name='models', context_str=models_context),
            SQLTableSchema(table_name='parts', context_str=parts_context),
            SQLTableSchema(table_name='years', context_str=years_context),
        ]
        obj_index = ObjectIndex.from_objects(
            table_schema_objs,
            table_node_mapping,
            VectorStoreIndex,
        )
        return sql_database, table_schema_objs, obj_index, table_context_str

    sql_database, table_schema_objs, obj_index, table_context_str = initialize_table_objects()

    context_str = (
        "Use JOINs prefaced with table names for combining multiple tabls."
        "Convert percentages to decimals (e.g., '50%' as '0.5')."
        "Split up make and model to look them up separately if given both the make and model of a car in the user query."
        "Use LOWER and LIKE for searching brand, modelName or description because of case sensitivity."
        "Use correct column names when joining multiple tables."
        "Be sure to look up make and model separately in their corresponding columns."
        "Pay close attention to filtering criteria mentioned in the question and incorporate them using the WHERE clause in your SQL query."
        "If the question involves multiple conditions, use logical operators such as AND, OR to combine them effectively."
        "If the question involves grouping of data (e.g., finding totals or averages for different categories), use the GROUP BY clause along with appropriate aggregate functions."
        "Consider using aliases for tables and columns to improve readability of the query, especially in case of complex joins or subqueries."
        "If necessary, use subqueries or common tables expressions (CTEs) to break down the problem into smaller, more manageable parts."
        "Ensure detailed, relevant responses."
    ) 
    context_str_combined = context_str + "\n\n" + table_context_str

    query_engine = SQLTableRetrieverQueryEngine(
        sql_database=sql_database,
        table_retriever=obj_index.as_retriever(similarity_top_k=1),
        synthesize_response=True,
        llm=OpenAI(temperature=0.1, model="gpt-3.5-turbo-0125"),
        context_str_prefix=context_str_combined
    )

    return query_engine

query_engine = setup_nlsql_query_engine()

def process_user_input_to_sql(user_input):
    response = query_engine.query(user_input)
    # print(f"RESPONSE: {response.metadata}")
    sql_query = response.metadata.get('sql_query', '').replace('\n', ' ').replace('\r', ' ').strip()
    # print(f"SQL QUERY after adjustment: {sql_query}")
    if sql_query.startswith('sql'):
        sql_query = sql_query[3:].strip()
    print(f"SQL: {sql_query}")
    # write_to_log(str(sql_query))
    return sql_query

def query_output(user_input):
    sql_query = process_user_input_to_sql(user_input)
    # print(f"SQL QUERY Output: {sql_query}")

    with engine.connect() as connection:
        result = connection.execute(text(sql_query))
        result_data = result.fetchall()
        if len(result_data) >= 5:
            result_df = pd.DataFrame(result_data, columns=result.keys())
            return result_df
        else:
            response = query_engine.query(sql_query)
            return str(response)

user_input = "Get all Honda CR-Z parts"
response = query_output(user_input)
print(response)


TABLE CONTEXT STRING: The Table description is: The 'model_year' table provides model and year information for a given part number when combined with 'models' table and 'years' table. 
                                Includes columns 'id', 'part_number', 'model_id', and 'year_id'. 

 The Table description is: The 'models' table provides the model name of a given model id. The model of a car is the specific product. Combine with 'model_year' to get the corresponding model of a specific model id. Includes columns 'id', 'part_number', and 'name'.

 The Table description is: The 'parts' table provides detailed inventory data for individual parts. Use part-specific queries. Includes columns 'part_number', 'description', 'quantity', 'price', and 'make'.
                            The 'make' of the car is the brand, not to be confused with the model of the car. (e.g. Honda is the make, CR-V is the model).
                            Combine with 'model_year', 'models', 'years' to get all inf

In [15]:

test_inputs = [
    "Get all CR-Z parts and reduce their price by 50%",
    "Get all parts made between 2005 and 2015",
    "Get all Honda Pilot parts and reduce their price by 10%",
    "Get all parts that are more expensive than $300 and reduce their price by 20%"
]

for test in test_inputs:
    response = query_output(test)
    print(test)
    if isinstance(response, pd.DataFrame):
        response = response.to_string()
    print(response)
    print("**********************************************************************************************")

print("DONE WITH ALL TESTS")


SQL: SELECT p.part_number, p.description, p.quantity, p.price * 0.5 AS reduced_price, p.make FROM parts p WHERE p.description LIKE '%CR-Z%' ORDER BY p.price DESC;
Get all CR-Z parts and reduce their price by 50%
There are no parts in the database that match the description containing 'CR-Z'.
**********************************************************************************************
SQL: SELECT p.part_number, p.description, p.make FROM parts p JOIN model_year my ON p.part_number = my.part_number JOIN years y ON my.year_id = y.id WHERE y.year >= 2005 AND y.year <= 2015 ORDER BY p.make;
Get all parts made between 2005 and 2015
          part_number                                description   make
0     04711-3A0-A00ZZ                             FACE FR BUMPER  Honda
1     04711-3A0-A00ZZ                             FACE FR BUMPER  Honda
2     04711-3A0-A00ZZ                             FACE FR BUMPER  Honda
3     04711-3A0-A00ZZ                             FACE FR BUMPER  Honda
4    

In [26]:
user_sql_examples = [
    {
        "question": "Get all CR-Z parts and reduce their price by 50%",
        "sql_query": """SELECT p.part_number, p.description, p.quantity, p.price * 0.5 as reduced_price, p.make, m.model_name, y.year FROM parts p
                        JOIN model_year my ON p.part_number = my.part_number
                        JOIN models m ON my.model_id = m.id
                        JOIN years y on my.year_id = y.id
                        WHERE LOWER(m.model_name) LIKE '%cr-z%';"""
    },
    {
        "question": "Get all Honda CR-Z parts and reduce their price by 30%",
        "sql_query": """SELECT p.part_number, p.description, p.quantity, p.price * 0.7 as reduced_price, p.make, m.model_name, y.year FROM parts p
                        JOIN model_year my ON p.part_number = my.part_number
                        JOIN models m ON my.model_id = m.id
                        JOIN years y on my.year_id = y.id
                        WHERE LOWER(m.model_name) LIKE '%cr-z%';"""
    },
    {
        "question": "Get all Honda parts and reduce their price by 20%",
        "sql_query": """SELECT p.part_number, p.description, p.quantity, p.price * 0.8 as reduced_price, p.make, m.model_name, y.year FROM parts p
                        JOIN model_year my ON p.part_number = my.part_number
                        JOIN models m ON my.model_id = m.id
                        JOIN years y on my.year_id = y.id
                        WHERE LOWER(p.make) LIKE '%honda%';"""
    },
    {
        "question": "Get all Honda parts between 2005 and 2015 and reduce their price by 20%",
        "sql_query": """SELECT p.part_number, p.description, p.quantity, p.price * 0.8 as reduced_price, p.make, m.model_name, y.year FROM parts p
                        JOIN model_year my ON p.part_number = my.part_number
                        JOIN models m ON my.model_id = m.id
                        JOIN years y on my.year_id = y.id
                        WHERE LOWER(p.make) LIKE '%honda%'
                        AND y.year >= 2005 AND y.year <= 2015;"""
    },
    {
        "question": "Get all Pilot parts and reduce their price by 50%",
        "sql_query": """SELECT p.part_number, p.description, p.quantity, p.price * 0.5 as reduced_price, p.make, m.model_name, y.year FROM parts p
                        JOIN model_year my ON p.part_number = my.part_number
                        JOIN models m ON my.model_id = m.id
                        JOIN years y on my.year_id = y.id
                        WHERE LOWER(m.model_name) LIKE '%pilot%';"""
    },
    {
        "question": "Get all Honda Pilot parts and reduce their price by 20%",
        "sql_query": """SELECT p.part_number, p.description, p.quantity, p.price * 0.8 as reduced_price, p.make, m.model_name, y.year FROM parts p
                        JOIN model_year my ON p.part_number = my.part_number
                        JOIN models m ON my.model_id = m.id
                        JOIN years y on my.year_id = y.id
                        WHERE LOWER(m.model_name) LIKE '%pilot%';"""
    },
    {
        "question": "Get all Honda Pilot parts and reduce their price by 20%",
        "sql_query": """SELECT p.part_number, p.description, p.quantity, p.price * 0.8 as reduced_price, p.make, m.model_name, y.year FROM parts p
                        JOIN model_year my ON p.part_number = my.part_number
                        JOIN models m ON my.model_id = m.id
                        JOIN years y on my.year_id = y.id
                        WHERE LOWER(m.model_name) LIKE '%pilot%';"""
    },
    {
        "make": "Honda",
        "models": "Pilot, Accord, CR-V, CR-Z, Civic, Clarity, Crosstour, Element, Fit, HR-V, Insight, Odyssey, Passport, Pilot, Ridgeline, S2000"
    }

]

In [30]:
import json

base_path = r"C:\Users\vivia\co-pilot-v1\Notebooks\\"
out_file = open(base_path + "bulk_upload.json", "w")

json.dump(user_sql_examples, out_file, indent=6)

out_file.close()

In [31]:
from llama_index.readers.json import JSONReader

json_file = r"C:\Users\vivia\co-pilot-v1\Notebooks\bulk_upload.json"
reader = JSONReader()
json_documents = reader.load_data(input_file=json_file)