In [4]:
from sqlalchemy import inspect
from sqlalchemy import create_engine
from sqlalchemy import text
import pandas as pd


In [22]:

# Location of the SQLite parts database (the commented-out path is an alternate database).
#db_file_path = r"C:\Users\vivia\co-pilot-v1\data\databases\partswise_island_moto.db"
db_file_path = r"C:\Users\vivia\co-pilot-v1\data\databases\parts_database.db"
engine = create_engine(f"sqlite:///{db_file_path}")

# Reflect the schema: the inspector exposes table, column and foreign-key metadata.
inspector = inspect(engine)
tables = inspector.get_table_names()

print("number of tables:", len(tables))

# First pass: every column of every table, together with its declared type.
for table in tables:
    print(f"\nTable: {table}")
    columns = inspector.get_columns(table)
    for column in columns:
        column_name, column_type = column['name'], column['type']
        print(f"  Column: {column_name} - {column_type}")

# Second pass: the foreign-key constraints declared on each table.
for table in tables:
    print(f"\nForeign Keys for Table: {table}")
    foreign_keys = inspector.get_foreign_keys(table)
    for fk in foreign_keys:
        local_columns = fk['constrained_columns']
        target_table = fk['referred_table']
        target_columns = fk['referred_columns']
        print(f"  Foreign Key: {local_columns} -> {target_table}.{target_columns}")

all_tables = []

number of tables: 5

Table: model_year
  Column: id - INTEGER
  Column: part_number - VARCHAR
  Column: model_id - INTEGER
  Column: year_id - INTEGER

Table: models
  Column: id - INTEGER
  Column: model_name - VARCHAR

Table: parts
  Column: part_number - VARCHAR
  Column: price - FLOAT
  Column: quantity - INTEGER
  Column: brand - VARCHAR
  Column: description - VARCHAR

Table: photos
  Column: id - INTEGER
  Column: url - VARCHAR
  Column: part_number - INTEGER

Table: years
  Column: id - INTEGER
  Column: year - INTEGER

Foreign Keys for Table: model_year
  Foreign Key: ['part_number'] -> parts.['part_number']
  Foreign Key: ['model_id'] -> models.['id']
  Foreign Key: ['year_id'] -> years.['id']

Foreign Keys for Table: models

Foreign Keys for Table: parts

Foreign Keys for Table: photos
  Foreign Key: ['part_number'] -> parts.['part_number']

Foreign Keys for Table: years


In [156]:
from json import loads, dumps

# Export a flat view of the parts catalogue to parts.csv: one row per
# (part, model, year) combination, joining model_year with models, parts and years.
query = text("""SELECT p.part_number, m.model_name, p.price, p.quantity, p.brand, p.description, y.year
                FROM model_year my
                JOIN models m ON my.model_id=m.id
                JOIN parts p ON my.part_number=p.part_number
                JOIN years y ON my.year_id=y.id
             """)

with engine.connect() as connection:
    # Fetch the rows while the connection is still open: reading from the
    # result after the `with` block has closed the connection is not reliable.
    parts_rows = connection.execute(query).fetchall()

# Column names used in the CSV (camelCase), in the same order as the SELECT list.
parts_df = pd.DataFrame(
    parts_rows,
    columns=['partNumber', 'modelName', 'price', 'quantity', 'brand', 'description', 'year'],
)
parts_df.to_csv(r'C:\Users\vivia\co-pilot-v1\Notebooks\parts.csv', index=False)
# parts_json = parts_df.to_json(orient="table")
# parsed_parts_json = loads(parts_json)

# parts_json_schema = parsed_parts_json['schema']
# parts_json_data = parsed_parts_json['data']

Use the pandasai `SmartDataframe` approach in the next cells for now.
The LlamaIndex pandas query pipeline isn't as good, but I kept its code further below in case we want to switch to it down the line.

In [7]:
import os
from getpass import getpass

import openai

# SECURITY: never hardcode API keys in a notebook. The key that used to be
# written here in plain text must be treated as compromised and revoked.
# The key is taken from the OPENAI_API_KEY environment variable; if it is not
# set, prompt for it without echoing so it never lands in the saved notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]


In [None]:

import os

from pandasai import SmartDataframe
from pandasai.llm.openai import OpenAI
from json import loads, dumps

# read in csv data and put into df
data = pd.read_csv(r'C:\Users\vivia\co-pilot-v1\Notebooks\parts.csv')

# initialize llm
# SECURITY: the API key is read from the environment instead of being pasted
# into the notebook; a missing variable fails loudly with a KeyError.
llm = OpenAI(api_token=os.environ["OPENAI_API_KEY"])

# create smart df from data and llm
sdf = SmartDataframe(data, description="Table that includes a part number, brand, price, quantity, description and year of a car part", config={"llm":llm})

# get user query and response
# Keep prompting until a query yields at least one row or the user types 'exit'.
rounds = 0
while True:
    if rounds == 0:
        message_to_user = "Describe how you would like to upload your data (e.g. 'get all bmw parts and reduce their price by 50%'): "
    else:
        message_to_user = "It looks like there were no results from that query. Would you like to try something else? Type 'exit' to stop: "
    user_query = input(message_to_user)
    if user_query.strip().lower() == "exit":
        break
    # "get all BMW parts" works but "Get all parts from BMW" doesnt?
    response = sdf.chat(user_query, "dataframe") #number, dataframe, plot, string
    # NOTE(review): `chat` is assumed to be able to return a non-dataframe value
    # (e.g. an error message string) when it cannot answer - confirm against the
    # installed pandasai version. Such a response is treated as "no results"
    # instead of crashing on a missing `to_json`.
    if hasattr(response, "to_json"):
        result = response.to_json(orient="table")
        response_json = loads(result)
        if response_json['data'] != []:
            break
    rounds += 1

The code below is for the other pandas query pipelines that I tried (LlamaIndex-based); it is kept for reference only.

In [159]:
import sqlite3
import csv

# Load parts.csv into a scratch SQLite database (bulk_upload.db) with a single
# `parts` table, so the data can be queried with SQL.
csv_path = r'C:\Users\vivia\co-pilot-v1\Notebooks\parts.csv'

# Column order of parts.csv, which is also the column order of the `parts` table.
expected_header = ['partNumber', 'modelName', 'price', 'quantity', 'brand', 'description', 'year']

# newline='' is what the csv module requires for correct handling of quoted
# line breaks; the file is written by pandas `to_csv`, which defaults to UTF-8.
with open(csv_path, newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    # The first line holds the column names, not a part: consume it so it is
    # not inserted into the table as if it were a data row.
    header = next(reader, None)
    if header is not None and header != expected_header:
        raise ValueError(f"Unexpected columns in {csv_path}: {header}")
    data = list(reader)

conn = sqlite3.connect(r'C:\Users\vivia\co-pilot-v1\Notebooks\bulk_upload.db')
try:
    cur = conn.cursor()

    # Rebuild the table from scratch so re-running this cell neither fails on
    # "table parts already exists" nor duplicates every row.
    cur.execute('DROP TABLE IF EXISTS parts')
    # price is REAL because the source `parts.price` column is a FLOAT.
    cur.execute('''CREATE TABLE parts (
                partNumber VARCHAR,
                modelName VARCHAR,
                price REAL,
                quantity INT,
                brand VARCHAR,
                description VARCHAR,
                year INT
                    )''' )
    cur.executemany(
        "INSERT INTO parts (partNumber, modelName, price, quantity, brand, description, year) values (?, ?, ?, ?, ?, ?, ?)",
        data,
    )

    conn.commit()
finally:
    # Always release the database file, even if an insert fails.
    conn.close()

# delete the database after?

In [5]:
# Point `engine` at the scratch bulk-upload database. This rebinds the name
# `engine`, which an earlier cell created for parts_database.db, so cells that
# use `engine` after this one talk to bulk_upload.db.
bulk_upload_db = r'C:\Users\vivia\co-pilot-v1\Notebooks\bulk_upload.db'
bulk_upload_url = f"sqlite:///{bulk_upload_db}"
engine = create_engine(bulk_upload_url)

In [21]:
from llama_index.core import SQLDatabase
from llama_index.core.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)
from llama_index.core import VectorStoreIndex
from llama_index.core.indices.struct_store import SQLTableRetrieverQueryEngine
from llama_index.llms.openai import OpenAI

def setup_nlsql_query_engine(db_engine=None):
    """Build a LlamaIndex natural-language-to-SQL query engine over the `parts` table.

    Args:
        db_engine: SQLAlchemy engine to query. Defaults to the notebook-level
            `engine`, which is what the original zero-argument call used.

    Returns:
        A `SQLTableRetrieverQueryEngine` that retrieves the `parts` table schema,
        has the LLM write SQL for a question, and synthesizes a text response.
    """
    if db_engine is None:
        db_engine = engine

    def initialize_table_objects():
        """Create the SQL database wrapper, table schema objects and their index."""
        sql_database = SQLDatabase(db_engine, sample_rows_in_table_info=2, include_tables=['parts'])

        # Contexts for the multi-table schema, kept for reference:
        # model_year_context = "Provides model and year information for a given part number when combined with 'models' table and 'years' table."
        # models_context = "Provides the name of a given model id. Combine with 'models_years' to get the corresponding model of a specific model id."
        # parts_context = """Provides detailed inventory data for individual parts. Use part-specific queries.
        #                     Combine with 'models_years', 'models', 'years', 'photos' to get all information on a given part number."""
        # photos_context = "Provides photos of a given part. Combine with 'parts' to connect with all other information of a part number."
        # years_context = "Provides the year of a given year id. Combine with 'models_years' to get the corresponding year of a specific year id."
        parts_context = "Provides detailed inventory data for individual parts, including the model name, price, quantity, brand, year, and a description of the part."

        # context_strs = [parts_context]
        table_context_str = "The Table description is: " + parts_context
        # table_context_str += "\n\n The Table descriptions is: ".join(context_strs)

        table_node_mapping = SQLTableNodeMapping(sql_database)
        table_schema_objs = [
            # SQLTableSchema(table_name='model_year', context_str=model_year_context),
            # SQLTableSchema(table_name='models', context_str=models_context),
            SQLTableSchema(table_name='parts', context_str=parts_context),
            # SQLTableSchema(table_name='photos', context_str=photos_context),
            # SQLTableSchema(table_name='years', context_str=years_context),
        ]
        obj_index = ObjectIndex.from_objects(
            table_schema_objs,
            table_node_mapping,
            VectorStoreIndex,
        )
        return sql_database, table_schema_objs, obj_index, table_context_str

    sql_database, table_schema_objs, obj_index, table_context_str = initialize_table_objects()

    # Instructions given to the LLM ahead of the table description. They are
    # joined with a space: adjacent string literals concatenate with no
    # separator, which previously glued sentences together ("tables.Convert ...").
    context_sentences = (
        "Use JOINs prefaced with table names for combining multiple tables.",
        "Convert percentages to decimals (e.g., '50%' as '0.5').",
        "Pay close attention to filtering criteria mentioned in the question and incorporate them using the WHERE clause in your SQL query.",
        "If the question involves multiple conditions, use logical operators such as AND, OR to combine them effectively.",
        "If the question involves grouping of data (e.g., finding totals or averages for different categories), use the GROUP BY clause along with appropriate aggregate functions.",
        "Consider using aliases for tables and columns to improve readability of the query, especially in case of complex joins or subqueries.",
        "If necessary, use subqueries or common table expressions (CTEs) to break down the problem into smaller, more manageable parts.",
        "Ensure detailed, relevant responses.",
        "When selecting brand, modelName use LOWER to make searches case insensitive.",
    )
    context_str = " ".join(context_sentences)
    context_str_combined = context_str + "\n\n" + table_context_str

    query_engine = SQLTableRetrieverQueryEngine(
        sql_database=sql_database,
        # Only one table is indexed, so retrieving the single best match is enough.
        table_retriever=obj_index.as_retriever(similarity_top_k=1),
        synthesize_response=True,
        llm=OpenAI(temperature=0.1, model="gpt-3.5-turbo-0125"),
        context_str_prefix=context_str_combined
    )

    return query_engine

query_engine = setup_nlsql_query_engine()

def process_user_input_to_sql(user_input):
    """Turn a natural-language request into a single-line SQL string.

    Sends `user_input` to the notebook-level `query_engine` and reads the
    generated SQL from the `sql_query` entry of the response metadata.

    Args:
        user_input: The user's request in plain language.

    Returns:
        The SQL text with line breaks replaced by spaces and any Markdown code
        fence / leading ``sql`` language tag removed. Returns an empty string
        when the response carries no SQL (metadata missing or `sql_query` unset).
    """
    response = query_engine.query(user_input)
    print(f"RESPONSE: {response.metadata}")

    # Metadata, or its 'sql_query' entry, can be absent or None; fall back to an
    # empty string instead of failing with AttributeError.
    metadata = response.metadata or {}
    raw_sql = metadata.get('sql_query') or ''
    sql_query = raw_sql.replace('\n', ' ').replace('\r', ' ').strip()

    # The model may wrap its answer in a Markdown fence (```sql ... ```); drop
    # the fence markers so only the statement remains.
    if sql_query.startswith('```'):
        sql_query = sql_query[3:].lstrip()
    if sql_query.endswith('```'):
        sql_query = sql_query[:-3].rstrip()
    print(f"SQL QUERY after adjustment: {sql_query}")

    # A leading "sql" is the language tag left over from such a fence.
    if sql_query.startswith('sql'):
        sql_query = sql_query[3:].strip()
    print(f"SQL: {sql_query}")
    return sql_query

def query_output(user_input, min_rows_for_dataframe=5):
    """Answer a natural-language request, as a table or as text.

    The request is converted to SQL with `process_user_input_to_sql` and run
    against the notebook-level `engine`.

    Args:
        user_input: The user's request in plain language.
        min_rows_for_dataframe: Minimum number of result rows for the raw rows
            to be returned as a DataFrame. Defaults to 5.

    Returns:
        A `pandas.DataFrame` of the result rows when there are at least
        `min_rows_for_dataframe` of them; otherwise the string form of the
        query engine's response to the generated SQL.
    """
    sql_query = process_user_input_to_sql(user_input)
    print(f"SQL QUERY Output: {sql_query}")

    # NOTE(review): this executes LLM-generated SQL verbatim. A request such as
    # "reduce their price" could yield a data-modifying statement; consider
    # restricting execution to SELECT statements or a read-only connection.
    with engine.connect() as connection:
        result = connection.execute(text(sql_query))
        result_data = result.fetchall()
        column_names = list(result.keys())

    # The connection is released before the (slow) follow-up LLM call below.
    if len(result_data) >= min_rows_for_dataframe:
        return pd.DataFrame(result_data, columns=column_names)

    response = query_engine.query(sql_query)
    return str(response)

# Example run: translate the request to SQL, execute it, and print either the
# resulting DataFrame or the engine's text answer (see `query_output`).
# Note the request asks to change prices; the SQL the model generates for it is
# executed as-is against `engine`.
user_input = "Get all BMW parts and reduce their price by 50%"
response = query_output(user_input)
print(response)

RESPONSE: {'cb881491-cd2b-4a9e-af9a-9e5f1f345a8c': {'sql_query': "SELECT partNumber, modelName, price * 0.5 AS reduced_price, brand\nFROM parts\nWHERE LOWER(brand) = 'bmw'\nORDER BY reduced_price DESC;", 'result': [('18518558278', 'K1600 Bagger', 1183.25, 'Bmw'), ('18518546594', 'K1600 Bagger', 1119.2900390625, 'Bmw'), ('18518558277', 'K1600 8 Variants Between  And .', 1100.0, 'Bmw'), ('18518558277', 'K1600 8 Variants Between  And .', 1100.0, 'Bmw'), ('18518530582', '  R1200 Gs 17', 833.9849853515625, 'Bmw'), ('18518530582', 'R1200 Gs', 833.9849853515625, 'Bmw'), ('18518530582', 'R1200 Gs', 833.9849853515625, 'Bmw'), ('18518530582', '  R1200 Gs Adventure', 833.9849853515625, 'Bmw'), ('18518530582', 'R1200 Gs', 833.9849853515625, 'Bmw'), ('18518530582', 'R1200 Gs', 833.9849853515625, 'Bmw'), ('18518530582', '  R1200 Gs 17', 833.9849853515625, 'Bmw'), ('18518530582', '  R1200 Gs 17', 833.9849853515625, 'Bmw'), ('18518530582', '  R1200 Gs 17', 833.9849853515625, 'Bmw'), ('18518530582', ' 