In [1]:
from sqlalchemy import create_engine, inspect, text
from sqlalchemy.orm import sessionmaker, scoped_session

db_file_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/databases/partswise_database.db"
engine = create_engine(f'sqlite:///{db_file_path}')
Session = scoped_session(sessionmaker(bind=engine))

# Create an inspector
inspector = inspect(engine)

# Get list of table names
table_names = inspector.get_table_names()

print("Tables in the database:")
for table_name in table_names:
    print(f"\nTable: {table_name}")
    
    # Get column information
    columns = inspector.get_columns(table_name)
    print("Columns:")
    for column in columns:
        print(f"  - {column['name']} (Type: {column['type']})")
    
    # Get primary key information
    pk = inspector.get_pk_constraint(table_name)
    if pk['constrained_columns']:
        print(f"Primary Key: {', '.join(pk['constrained_columns'])}")
    
    # Get foreign key information
    foreign_keys = inspector.get_foreign_keys(table_name)
    if foreign_keys:
        print("Foreign Keys:")
        for fk in foreign_keys:
            print(f"  - {fk['constrained_columns']} -> {fk['referred_table']}.{fk['referred_columns']}")

# Don't forget to close the session when you're done
Session.remove()



Tables in the database:

Table: inventory
Columns:
  - id (Type: INTEGER)
  - part_number (Type: VARCHAR(15))
  - quantity (Type: INTEGER)
  - safety_stock (Type: INTEGER)
  - reorder_point (Type: INTEGER)
  - part_status (Type: VARCHAR(8))
  - negative_on_hand (Type: INTEGER)
  - last_updated (Type: DATETIME)
Primary Key: id
Foreign Keys:
  - ['part_number'] -> parts.['part_number']

Table: part_metrics
Columns:
  - id (Type: INTEGER)
  - part_number (Type: VARCHAR(15))
  - date (Type: DATE)
  - roi (Type: FLOAT)
  - demand (Type: FLOAT)
  - obsolescence_risk (Type: FLOAT)
  - sales_volatility (Type: FLOAT)
  - sales_trend (Type: FLOAT)
  - recent_sales_trend (Type: FLOAT)
  - days_of_inventory_outstanding (Type: INTEGER)
  - sell_through_rate (Type: FLOAT)
  - order_to_sales_ratio (Type: FLOAT)
  - seasonality_strength (Type: FLOAT)
  - consistency (Type: FLOAT)
  - anomaly_score (Type: FLOAT)
  - recurrence_score (Type: FLOAT)
  - last_updated (Type: DATETIME)
Primary Key: id
Foreig

In [2]:
import os
os.environ["OPENAI_API_KEY"] = "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"

In [3]:
import re
class QueryProcessor:
    def __init__(self):
        self.sql_keywords = [
            'SELECT', 'DROP', 'INSERT', 'DELETE', 'UPDATE', 'CREATE', 
            'ALTER', 'EXEC', 'UNION', 'DECLARE'
        ]

    def remove_sql_keywords(self, text):
        pattern = r'\b(' + '|'.join(self.sql_keywords) + r')\b'
        return re.sub(pattern, '', text, flags=re.IGNORECASE)
    
    def normalize_whitespace_and_characters(self, text):
        text = re.sub(r'[\n\r\t]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r"[^\w\s%.,()_+-/*=<>'`]", '', text)  # Added ' and `
        return text.strip()

    def remove_sql_prefix(self, text):
        return re.sub(r'^sql\s*', '', text, flags=re.IGNORECASE)

    def preprocess_query(self, user_input):
        user_input = user_input.strip()
        user_input = self.remove_sql_prefix(user_input)  # Remove 'sql' prefix first
        user_input = re.sub(r'(\d+)%', lambda match: str(float(match.group(1)) / 100), user_input)
        user_input = self.normalize_whitespace_and_characters(user_input)
        user_input = self.remove_sql_keywords(user_input)
        return user_input
    
    def postprocess_sql_query(self, sql_query):
        sql_query = self.remove_sql_prefix(sql_query)
        sql_query = self.normalize_whitespace_and_characters(sql_query)
        return sql_query


In [4]:
import os
import pandas as pd
import openai
from sqlalchemy import create_engine, text
from functools import lru_cache
from llama_index.core import (
    SQLDatabase,
    VectorStoreIndex
)
from llama_index.core.tools import ToolMetadata, QueryEngineTool
from llama_index.core.objects import SQLTableSchema, SQLTableNodeMapping, ObjectIndex
from llama_index.core.query_engine import SQLTableRetrieverQueryEngine
from llama_index.llms.openai import OpenAI

class SQLQueryTool:
    def __init__(self, engine, name, description):
        print("Initializing SQLQueryTool...")
        self.engine = engine
        self.name = name
        self.description = description
        self.query_processor = QueryProcessor()
        self.sql_database, self.table_schema_objs, self.obj_index = self._initialize_table_objects()
        self.context_str_combined = self._create_context_str()  
        self.query_engine = self._create_query_engine()
        openai.api_key = os.environ["OPENAI_API_KEY"]
        print("Initialization complete.")

    def _initialize_table_objects(self):
        sql_database = SQLDatabase(self.engine, sample_rows_in_table_info=2, include_tables=[
            'sales', 'parts', 'inventory', 'part_metrics', 'temporal_metrics', 'supplier_orders'
        ])
        table_contexts = {
            'sales': "Records sales data for parts, including the part number, month (stored as a string), year, and quantity sold. Use this table for time-based sales data and sales trend analysis.",
            'parts': "Contains detailed information about each part, including the part number, description, supplier name (brand), price, and cost per unit. Use this table for detailed part information.",
            'inventory': "Tracks the inventory levels of parts, including the part number, quantity on hand, safety stock levels, reorder points, part status, and other related metrics. Use this table to monitor inventory levels and stock status.",
            'part_metrics': "Provides various performance metrics for parts, including ROI, demand, obsolescence risk, days of inventory outstanding, sell-through rate, and order-to-sales ratio. Use this table to analyze part performance and risk.",
            'temporal_metrics': "Contains time-based performance metrics for parts, including months with no sales, sales volatility, sales trend, recent sales trend, days of supply (for different periods), and turnover rates. Use this table for temporal analysis of part performance.",
            'supplier_orders': "Tracks orders placed with suppliers, including the part number, quantity ordered year-to-date, and whether the order was a special order. Use this table to monitor supplier orders and order patterns.",
        }

        table_schema_objs = [SQLTableSchema(table_name=name, context_str=context) for name, context in table_contexts.items()]
        table_node_mapping = SQLTableNodeMapping(sql_database)

        obj_index = ObjectIndex.from_objects(
            table_schema_objs,
            table_node_mapping,
            VectorStoreIndex,
        )
        return sql_database, table_schema_objs, obj_index
    
    @lru_cache()
    def _create_context_str(self):
        context_str = (
            "Ensure detailed, relevant responses and ALWAYS include numerical figures wherever possible. "
            "Access 'supplier_name' flexibly e.g., ('%bmw%'). "
            "Use JOINs prefaced with table names for combining multiple tables. "
            "Order months chronologically and use abbreviations for months (e.g., Jan, Feb, ..., Dec) in query results. Months are stored as strings."
        )
        table_context_str = self._get_table_context_str()
        return context_str + "\n\n" + table_context_str
    
    @lru_cache()
    def _get_table_context_str(self):
        context_strs = []
        for table_schema_obj in self.table_schema_objs:
            table_info = self.sql_database.get_single_table_info(table_schema_obj.table_name)
            if table_schema_obj.context_str:
                table_info += f" The table description is: {table_schema_obj.context_str}"
            context_strs.append(table_info)
        return "\n\n".join(context_strs)

    def _create_query_engine(self):
        return SQLTableRetrieverQueryEngine(
            sql_database=self.sql_database,
            table_retriever=self.obj_index.as_retriever(similarity_top_k=1),
            synthesize_response=True,
            llm=OpenAI(temperature=0.1, model="gpt-4o-mini-2024-07-18"),
            context_str_prefix=self.context_str_combined
        )
    
    def query(self, user_input, return_sql=False):
        preprocessed_input = self.query_processor.preprocess_query(user_input)
        response = self.query_engine.query(preprocessed_input)
        if return_sql:
            return response.metadata.get('sql_query', '')
        return response
    
    @lru_cache()
    def query_text(self, user_input):
        """ Handle textual queries """
        sql_query = self.query(user_input, return_sql=True)
        original_sql_query = sql_query  # Store the original query
        sql_query = self.query_processor.postprocess_sql_query(sql_query)
        print(f"Generated SQL query: {sql_query}")
        try:
            result_data, columns = self.execute_sql_query(sql_query)
            print(f'Results Data: {result_data}')
            if len(result_data) > 5:
                return pd.DataFrame(result_data, columns=columns)
            else:

                interpretation_prompt = f"""
                The following SQL query was executed:
                {original_sql_query}

                It returned the following results:
                {result_data}

                Provide a very concise one-sentence answer to the question: "{user_input}"
                Focus only on the key facts and figures. Do not explain the query or methodology.
                """
                interpretation = self.query_engine.query(interpretation_prompt)
                return interpretation.response
        except Exception as e:
            print(f"Error executing SQL query: {e}")
            # If there's an error, we'll fall back to asking the LLM to explain
            error_prompt = f"""
            The following SQL query was generated in response to the question "{user_input}":
            {original_sql_query}

            However, there was an error executing this query. Can you explain what the query is trying to do,
            and suggest any potential issues with the query or the database schema that might have caused this error?
            """
            explanation = self.query_engine.query(error_prompt)
            return explanation.response  # Direct LLM query if not enough data for a dataframe
    
    def execute_sql_query(self, sql_query):
        """ Execute an SQL query and return the results and column names """
        with self.engine.connect() as connection:
            result = connection.execute(text(sql_query))
            result_data = result.fetchall()
            columns = result.keys()
        return result_data, columns

    def query_output(self, user_input):
        """ Determine the intent and dispatch to the appropriate handler """
        return self.query_text(user_input)

    def _create_tool(self):
        return QueryEngineTool(
            query_engine=self.query_engine,
            metadata=ToolMetadata(
                name=self.name,
                description=self.description
            )
        )

db_file_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/databases/partswise_database.db"
engine = create_engine(f'sqlite:///{db_file_path}')
sql_tool_instance = SQLQueryTool(
    engine=engine, 
    name="SQL Query Tool",
    description="Tool for in-depth analysis of parts department performance and inventory management. Provide actionable data-driven insights either in tabular format or in targetted sentance format."
)

# Example query
result = sql_tool_instance.query_output("What is the current inventory level of each obsolete part with a price over $100?")
print(f'Query results: {result}')



Initializing SQLQueryTool...


python(58187) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Initialization complete.
Generated SQL query: SELECT p.part_number, p.description, i.quantity FROM parts p JOIN inventory i ON p.part_number = i.part_number WHERE p.part_status = 'obsolete' AND p.price > 100
Error executing SQL query: (sqlite3.OperationalError) no such column: p.part_status
[SQL: SELECT p.part_number, p.description, i.quantity FROM parts p JOIN inventory i ON p.part_number = i.part_number WHERE p.part_status = 'obsolete' AND p.price > 100]
(Background on this error at: https://sqlalche.me/e/20/e3q8)
Query results: The SQL query is designed to retrieve the current inventory levels of obsolete parts that have a price greater than $100. It does this by selecting the part number, description, and quantity from two tables: `parts` and `inventory`. The query joins these tables on the `part_number` field, filtering the results to include only those parts that are marked as 'obsolete' and have a price exceeding $100.

However, the error message indicates that there is an issue

In [20]:
import os
import pandas as pd
from sqlalchemy import create_engine

# Path to the database
db_file_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/databases/partswise_database.db"
engine = create_engine(f'sqlite:///{db_file_path}')

# Initialize the SQLQueryTool
sql_tool_instance = SQLQueryTool(
    engine=engine, 
    name="SQL Query Tool",
    description="Tool for in-depth analysis of parts department performance and inventory management"
)

# List of test queries
test_queries = [
    "What is the average monthly gross profit for the current year?",
    "What is the current inventory level of each obsolete part with a price over $100?",
    "What are the safety stock levels for each essential part?",
    "What is the ROI for each part for the current year?",
    "Which parts have the highest obsolescence risk?",
    "What is the turnover rate for each part in the last three months?",
    "How many parts have been ordered from each supplier year-to-date?",
    "Which suppliers have the highest special orders?",
    "Which parts had no sales in the last six months?",
    "What is the recent sales trend for each part?",
    "What is the sales volatility for each part in the last year?",
    "What are all my parts that are most at risk of becoming obsolete but are not obsolete yet?",
    "Which brand has the most active parts?",
    "What are the top 5 brands by most obsolescence in terms of total cost?",
    "What parts have had the most positive recent sales trend?",
    "Which parts have had a negative recent sales trend?",
    "What parts have a negative roi but have had more than 10 sales this year?",
    "What parts likely experience a stockout this year based on the sales trend and number of sales across months?"
]

# Function to run test queries and print results
def run_test_queries(sql_tool_instance, queries):
    for query in queries:
        print(f"Query: {query}")
        result = sql_tool_instance.query_output(query)
        if result is not None:
            print(result)
        else:
            print("No data returned or error encountered.")
        print(f"Generated SQL: {result}")
        print("-" * 80)

# Run the test queries
run_test_queries(sql_tool_instance, test_queries)

Initializing SQLQueryTool...


2024-07-31 20:56:54,980 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Initialization complete.
Query: What is the average monthly gross profit for the current year?


2024-07-31 20:56:55,492 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-31 20:56:55,509 - INFO - > Table desc str: Ensure detailed, relevant responses and ALWAYS include numerical figures wherever possible. Access 'supplier_name' flexibly e.g., ('%bmw%'). Use JOINs prefaced with table names for combining multiple tables. Order months chronologically and use abbreviations for months (e.g., Jan, Feb, ..., Dec) in query results. Months are stored as strings.

Table 'sales' has columns: id (INTEGER), part_number (VARCHAR(15)), month (VARCHAR(3)), year (INTEGER), quantity_sold (INTEGER), last_updated (DATETIME), and foreign keys: ['part_number'] -> parts.['part_number']. The table description is: Records sales data for parts, including the part number, month (stored as a string), year, and quantity sold. Use this table for time-based sales data and sales trend analysis.

Table 'parts' has columns: part_number (VARCHAR(15)), description (VARCHAR(255

Generated SQL query: SELECT strftime('%Y-%m', sales.year  '-'  sales.month  '-01') AS month_year, AVG((parts.price - parts.cost_per_unit) * sales.quantity_sold) AS average_gross_profit FROM sales JOIN parts ON sales.part_number = parts.part_number WHERE sales.year = strftime('%Y', 'now') GROUP BY month_year ORDER BY month_year
Error executing SQL query: (sqlite3.OperationalError) near "'-'": syntax error
[SQL: SELECT strftime('%Y-%m', sales.year  '-'  sales.month  '-01') AS month_year, AVG((parts.price - parts.cost_per_unit) * sales.quantity_sold) AS average_gross_profit FROM sales JOIN parts ON sales.part_number = parts.part_number WHERE sales.year = strftime('%Y', 'now') GROUP BY month_year ORDER BY month_year]
(Background on this error at: https://sqlalche.me/e/20/e3q8)


2024-07-31 20:57:06,503 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-31 20:57:06,548 - INFO - > Table desc str: Ensure detailed, relevant responses and ALWAYS include numerical figures wherever possible. Access 'supplier_name' flexibly e.g., ('%bmw%'). Use JOINs prefaced with table names for combining multiple tables. Order months chronologically and use abbreviations for months (e.g., Jan, Feb, ..., Dec) in query results. Months are stored as strings.

Table 'sales' has columns: id (INTEGER), part_number (VARCHAR(15)), month (VARCHAR(3)), year (INTEGER), quantity_sold (INTEGER), last_updated (DATETIME), and foreign keys: ['part_number'] -> parts.['part_number']. The table description is: Records sales data for parts, including the part number, month (stored as a string), year, and quantity sold. Use this table for time-based sales data and sales trend analysis.

Table 'parts' has columns: part_number (VARCHAR(15)), description (VARCHAR(255

The SQL query is designed to calculate the average monthly gross profit for the current year by performing the following steps:

1. **Date Formatting**: It uses the `strftime` function to format the year and month from the `sales` table into a `month_year` string (e.g., '2023-01' for January 2023).
2. **Gross Profit Calculation**: It calculates the gross profit for each sale by subtracting the `cost_per_unit` from the `price` of the parts and multiplying by the `quantity_sold`.
3. **Aggregation**: The query groups the results by `month_year` and computes the average gross profit for each month.
4. **Filtering**: It filters the results to include only sales from the current year, as determined by `strftime('%Y', 'now')`.

### Potential Issues:
1. **Data Type Mismatch**: If the `year` or `month` columns in the `sales` table are not stored as integers or strings in a compatible format, the concatenation and formatting may fail.
2. **Null Values**: If there are any null values in `parts.pr

2024-07-31 20:57:22,498 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-31 20:57:22,530 - INFO - > Table desc str: Ensure detailed, relevant responses and ALWAYS include numerical figures wherever possible. Access 'supplier_name' flexibly e.g., ('%bmw%'). Use JOINs prefaced with table names for combining multiple tables. Order months chronologically and use abbreviations for months (e.g., Jan, Feb, ..., Dec) in query results. Months are stored as strings.

Table 'sales' has columns: id (INTEGER), part_number (VARCHAR(15)), month (VARCHAR(3)), year (INTEGER), quantity_sold (INTEGER), last_updated (DATETIME), and foreign keys: ['part_number'] -> parts.['part_number']. The table description is: Records sales data for parts, including the part number, month (stored as a string), year, and quantity sold. Use this table for time-based sales data and sales trend analysis.

Table 'parts' has columns: part_number (VARCHAR(15)), description (VARCHAR(255

Generated SQL query: SELECT p.part_number, p.description, i.quantity FROM parts p JOIN inventory i ON p.part_number = i.part_number WHERE p.part_status = 'obsolete' AND p.price > 100
Error executing SQL query: (sqlite3.OperationalError) no such column: p.part_status
[SQL: SELECT p.part_number, p.description, i.quantity FROM parts p JOIN inventory i ON p.part_number = i.part_number WHERE p.part_status = 'obsolete' AND p.price > 100]
(Background on this error at: https://sqlalche.me/e/20/e3q8)


2024-07-31 20:57:26,974 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-31 20:57:26,983 - INFO - > Table desc str: Ensure detailed, relevant responses and ALWAYS include numerical figures wherever possible. Access 'supplier_name' flexibly e.g., ('%bmw%'). Use JOINs prefaced with table names for combining multiple tables. Order months chronologically and use abbreviations for months (e.g., Jan, Feb, ..., Dec) in query results. Months are stored as strings.

Table 'sales' has columns: id (INTEGER), part_number (VARCHAR(15)), month (VARCHAR(3)), year (INTEGER), quantity_sold (INTEGER), last_updated (DATETIME), and foreign keys: ['part_number'] -> parts.['part_number']. The table description is: Records sales data for parts, including the part number, month (stored as a string), year, and quantity sold. Use this table for time-based sales data and sales trend analysis.

Table 'parts' has columns: part_number (VARCHAR(15)), description (VARCHAR(255

The SQL query is designed to retrieve the current inventory levels of obsolete parts that have a price greater than $100. It does this by selecting the part number, description, and quantity from two tables: `parts` and `inventory`. The query joins these tables on the `part_number` field, filtering the results to include only those parts that are marked as 'obsolete' and have a price exceeding $100.

However, the error message indicates that there is an issue with the SQL statement, which could stem from several potential problems:

1. **Table or Column Names**: The tables `parts` and `inventory`, or the columns `part_number`, `description`, `quantity`, `part_status`, and `price` may not exist in the database schema as specified. It's important to verify that these names are correct and match the actual schema.

2. **Data Types**: If the `price` column is not of a numeric type, the comparison `p.price > 100` could cause an error. Ensure that the `price` column is defined with a compati

2024-07-31 20:57:38,925 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-31 20:57:38,935 - INFO - > Table desc str: Ensure detailed, relevant responses and ALWAYS include numerical figures wherever possible. Access 'supplier_name' flexibly e.g., ('%bmw%'). Use JOINs prefaced with table names for combining multiple tables. Order months chronologically and use abbreviations for months (e.g., Jan, Feb, ..., Dec) in query results. Months are stored as strings.

Table 'sales' has columns: id (INTEGER), part_number (VARCHAR(15)), month (VARCHAR(3)), year (INTEGER), quantity_sold (INTEGER), last_updated (DATETIME), and foreign keys: ['part_number'] -> parts.['part_number']. The table description is: Records sales data for parts, including the part number, month (stored as a string), year, and quantity sold. Use this table for time-based sales data and sales trend analysis.

Table 'parts' has columns: part_number (VARCHAR(15)), description (VARCHAR(255

Generated SQL query: SELECT p.part_number, p.description, i.quantity, i.reorder_point FROM inventory i JOIN parts p ON i.part_number = p.part_number WHERE i.quantity < i.reorder_point ORDER BY i.quantity ASC
Results Data: [('11-282s', 'stick on wheel weights 1/4oz', 0, 53), ('11338389397', 'rocker arm', 0, 1), ('11348530175', 'valve collet', 0, 1), ('153209', 'oil 4t sae 10w40 str synt 4l l', 0, 1), ('153216', 'oil 4t sae 10w40 str synt 1l l', 0, 6), ('153229', 'oil 4t sae 15w50 str synt 1l l', 0, 5), ('153307-l', 'oil,4t,sae,15w50,full,syn', 0, 131), ('1550', 'bulk 1550', 0, 77), ('3601-0605-l', 'motorex,4t,15w50', 0, 59), ('3605-0039', '(cs/12)c2 chain lube road 400m', 0, 2), ('3703-0006', 'brake fluid dot 4 250ml', 0, 53), ('3703-0038', 'brake fluid 3 & 4 500ml motul', 0, 1), ('3705-0014', '(cs/12) motocool expert -37 1', 0, 2), ('3706-0042', '(case)star tron fuel add 1oz-', 0, 2), ('3807-0352-s', 'drag optimate cable o1,each', 0, 23), ('3850-0442', 'brush-chain', 0, 1), ('550047150

2024-07-31 20:57:49,638 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-31 20:57:49,643 - INFO - > Table desc str: Ensure detailed, relevant responses and ALWAYS include numerical figures wherever possible. Access 'supplier_name' flexibly e.g., ('%bmw%'). Use JOINs prefaced with table names for combining multiple tables. Order months chronologically and use abbreviations for months (e.g., Jan, Feb, ..., Dec) in query results. Months are stored as strings.

Table 'sales' has columns: id (INTEGER), part_number (VARCHAR(15)), month (VARCHAR(3)), year (INTEGER), quantity_sold (INTEGER), last_updated (DATETIME), and foreign keys: ['part_number'] -> parts.['part_number']. The table description is: Records sales data for parts, including the part number, month (stored as a string), year, and quantity sold. Use this table for time-based sales data and sales trend analysis.

Table 'parts' has columns: part_number (VARCHAR(15)), description (VARCHAR(255

KeyboardInterrupt: 

In [29]:
import sqlite3
import pandas as pd

db_file_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/databases/partswise_database.db"

# Connect to the database
conn = sqlite3.connect(db_file_path)

# Define the query
query = """
    SELECT p.part_number, p.description, i.quantity, p.price, i.part_status FROM parts p JOIN inventory i ON p.part_number = i.part_number WHERE i.part_status = 'obsolete' AND p.price > 1000
"""

# Execute the query and fetch the results
try:
    df = pd.read_sql_query(query, conn)
    print(df)
except Exception as e:
    print(f"Error executing query: {str(e)}")
finally:
    conn.close()


     part_number                     description  quantity    price  \
0    1025320-800         asm-fender,frt,onyx,blk         0  1254.99   
1       t2506393                 instruments, km         0  1113.99   
2    23001541162              transmission cover         0  1390.99   
3       a9680022           silencer assembly, us         0  1559.99   
4           ss50                    super sevina         0  1840.00   
..           ...                             ...       ...      ...   
205   4169057360            c5 master blue 61 xl         2  1099.99   
206     a9600617          v&h silencer pair, blk         3  1090.99   
207    1812-0406  14-22 indian slimline duals hd         3  1744.99   
208  83212365962       motor oil advantec 15w-50         4  3864.99   
209  2879768-266  kit-exhaust,stage 1,50,csr,blk         4  1024.99   

    part_status  
0      obsolete  
1      obsolete  
2      obsolete  
3      obsolete  
4      obsolete  
..          ...  
205    obsolete  
206

In [5]:
import plotly.graph_objects as go
import logging
import plotly.graph_objs as go
from llama_index.core import PromptTemplate
from llama_index.core.tools import FunctionTool
import logging

class VisualQueryTool(SQLQueryTool):
    def query(self, user_input):
        result, sql_query = super().query(user_input)
        
        if result is None:
            return {
                'text_response': sql_query,
                'visual': None
            }

        chart_details = self.extract_chart_details(user_input)
        visual = self.generate_visual(result, chart_details)
        
        return {
            'text_response': result.to_string(),
            'visual': visual
        }

    def extract_chart_details(self, user_input):
        text_qa_template_str = (
            "Given the query: '{query_str}', extract the following details, adding hover data to enhance the utility of the charts:\n"
            "1. Chart type (e.g., bar, line, pie, etc.)\n"
            "2. Title for the chart\n"
            "3. X-axis label\n"
            "4. Y-axis label\n"
            "5. Additional columns for hover data\n"
            "\n"
            "Provide the details in the following format:\n"
            "Chart type: <type>\n"
            "Title: <title>\n"
            "X-axis label: <label>\n"
            "Y-axis label: <label>\n"
            "Hover data: <column1>, <column2>, ...\n"
        )
        text_qa_template = PromptTemplate(text_qa_template_str)
        
        prompt = text_qa_template.format(query_str=user_input)
        response = self.query_engine.query(prompt)
        logging.info(f"LLM Response: {response.message.content.strip()}")
        response_text = response.message.content.strip()
        lines = response_text.split('\n')
        
        chart_details = {
            "chart_type": "bar",  # Default value
            "title": "Generated Chart",
            "x_axis_label": "X Axis",
            "y_axis_label": "Y Axis",
            "hover_labels": []
        }

        for line in lines:
            if "Chart type:" in line:
                chart_details["chart_type"] = line.split(":", 1)[1].strip().lower()
            elif "Title:" in line:
                chart_details["title"] = line.split(":", 1)[1].strip()
            elif "X-axis label:" in line:
                chart_details["x_axis_label"] = line.split(":", 1)[1].strip()
            elif "Y-axis label:" in line:
                chart_details["y_axis_label"] = line.split(":", 1)[1].strip()
            elif "Hover data:" in line:
                chart_details["hover_labels"] = [col.strip().lower().replace(' ', '_') for col in line.split(":", 1)[1].strip().split(',')]
        
        return chart_details

    def generate_visual(self, query_result, chart_details):
        if query_result is None or query_result.empty:
            return None

        hovertext = {col: True for col in chart_details['hover_labels'] if col in query_result.columns} if chart_details['hover_labels'] else None

        if chart_details['chart_type'] == 'bar':
            fig = go.Figure(data=[go.Bar(x=query_result[query_result.columns[0]], y=query_result[query_result.columns[1]], hovertext=query_result[hovertext] if hovertext else None)])
        elif chart_details['chart_type'] == 'line':
            fig = go.Figure(data=[go.Scatter(x=query_result[query_result.columns[0]], y=query_result[query_result.columns[1]], mode='lines', hovertext=query_result[hovertext] if hovertext else None)])
        elif chart_details['chart_type'] == 'pie':
            fig = go.Figure(data=[go.Pie(labels=query_result[query_result.columns[0]], values=query_result[query_result.columns[1]], hovertext=query_result[hovertext] if hovertext else None)])
        else:
            fig = go.Figure(data=[go.Bar(x=query_result[query_result.columns[0]], y=query_result[query_result.columns[1]], hovertext=query_result[hovertext] if hovertext else None)])
        
        fig.update_layout(
            title=chart_details['title'],
            xaxis_title=chart_details['x_axis_label'],
            yaxis_title=chart_details['y_axis_label']
        )
        
        return fig

    def _create_tool(self, name, description):
        return QueryEngineTool(
            query_engine=self.query_engine,
            metadata=ToolMetadata(
                name=name,
                description=description
            )
        )


use this for the below: https://docs.llamaindex.ai/en/stable/examples/query_engine/SQLAutoVectorQueryEngine/

In [None]:
import logging
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.readers.file.base import SimpleDirectoryReader
from llama_index.core.tools import QueryEngineTool, ToolMetadata

class KnowledgeDatabase:
    def __init__(self, knowledge_dir):
        self.knowledge_index = self._load_or_create_index(knowledge_dir, "knowledge")
        self.knowledge_tool = self._create_knowledge_tool()

    def _load_or_create_index(self, directory, index_type):
        try:
            storage_context = StorageContext.from_defaults(persist_dir=directory)
            index = load_index_from_storage(storage_context)
            logging.info(f"{index_type.capitalize()} index loaded successfully.")
        except Exception as e:
            logging.warning(f"Failed to load {index_type} index: {str(e)}. Creating new index.")
            docs = SimpleDirectoryReader(input_dir=directory).load_data()
            index = VectorStoreIndex.from_documents(docs)
            index.storage_context.persist(persist_dir=directory)
            logging.info(f"New {index_type} index created and persisted.")
        return index

    def _create_knowledge_tool(self):
        return QueryEngineTool(
            query_engine=self.knowledge_index.as_query_engine(),
            metadata=ToolMetadata(
                name="knowledge database",
                description=(
                    "Provides detailed information about auto parts inventory management. "
                    "Acts as a strategic advisor for parts managers with data driven insights. "
                    "Use a detailed plain text question as input to the tool."
                )
            )
        )

    def get_strategic_advice(self, context):
        return self.knowledge_tool.query(context)

In [7]:
class PerformanceSummaryTool(SQLQueryTool):
    def __init__(self, engine, name, description):
        super().__init__(engine, name, description)  # Call SQLQueryTool's __init__
        self.query_tool = SQLQueryTool(engine, name, description)
        self.visual_tool = VisualQueryTool(engine, name, description)
        self.name = name
        self.description = description

    def generate_summary(self):
        # Sales Revenue
        sales_revenue_query = "SELECT SUM(revenue) as total_revenue FROM materialized_sales_revenue"
        sales_revenue, _ = self.query_tool.query(sales_revenue_query)
        
        # Gross Profit
        gross_profit_query = "SELECT SUM(gross_profit) as total_gross_profit FROM materialized_gross_profit"
        gross_profit, _ = self.query_tool.query(gross_profit_query)

        gross_margin_query = "SELECT avg_profit_margin FROM materialized_gross_profit_margin"
        gross_margin, _ = self.query_tool.query(gross_margin_query)

        turnover_query = "SELECT AVG(turnover) as avg_turnover FROM temporal_metrics"
        turnover, _ = self.query_tool.query(turnover_query)
        
        # Percentage of Inventory Obsolete
        inventory_obsolete_query = "SELECT SUM(obsolescence_risk) / SUM(quantity) as percent_obsolete FROM part_metrics"
        inventory_obsolete, _ = self.query_tool.query(inventory_obsolete_query)
        inventory_obsolete_chart = self.visual_tool.query(inventory_obsolete_query)

        monthly_gross_profit_query = "SELECT SUM(gross_profit) as total_gross_profit FROM materialized_monthly_gross_profit_ytd"
        monthly_gross_profit,_ = self.query_tool.query(monthly_gross_profit_query)
        monthly_gross_profit_chart = self.visual_tool.query(monthly_gross_profit_query)
        
        # Total Cost of Obsolescence
        total_cost_obsolescence_query = "SELECT SUM(total_cost) as total_cost_obsolescence FROM materialized_total_cost WHERE part_status = 'obsolete'"
        total_cost_obsolescence, _ = self.query_tool.query(total_cost_obsolescence_query)
        
        # Top Performing Brands
        top_brands_query = "SELECT supplier_name, SUM(gross_profit) as total_gross_profit FROM materialized_gross_profit GROUP BY supplier_name ORDER BY total_gross_profit DESC LIMIT 5"
        top_brands, _ = self.query_tool.query(top_brands_query)
        
        # Text Analysis on How to Reduce Obsolescence
        knowledge_base_query = (
            "Provide a summary of strategies to reduce obsolescence in inventory management, "
            "considering factors like ordering practices, supplier relationships, and sales trends."
        )
        response = self.query_tool.query_engine.query(knowledge_base_query)
        reduction_strategies = response.message.content.strip()
        
        summary = {
            'sales_revenue': sales_revenue.to_string() if sales_revenue is not None else "No data",
            'gross_profit': gross_profit.to_string() if gross_profit is not None else "No data",
            'gross_margin': gross_margin.to_string() if gross_margin is not None else "No data",
            'turnover':turnover.to_string() if turnover is not None else "No data",
            'percent_obsolete': inventory_obsolete.to_string() if inventory_obsolete is not None else "No data",
            'total_cost_obsolescence': total_cost_obsolescence.to_string() if total_cost_obsolescence is not None else "No data",
            'top_brands': top_brands.to_string() if top_brands is not None else "No data",
            'monthly_gross_profit_ytd': monthly_gross_profit.to_string() if monthly_gross_profit is not None else "No data",
            'reduction_strategies': reduction_strategies,
            'visualizations': {
                'inventory_obsolete_chart': inventory_obsolete_chart['visual'] if inventory_obsolete_chart is not None else None,
                'monthly_gross_profit_chart': monthly_gross_profit_chart['visual'] if monthly_gross_profit_chart is not None else None
            }
        }
        
        return summary
    def _create_tool(self, name, description):
        return QueryEngineTool(
            query_engine=self.query_engine,
            metadata=ToolMetadata(
                name=name,
                description=description
            )
        )
        

In [8]:
from datetime import datetime
class DateTool:
    def __init__(self):
        self.tool = self._create_tool()

    def get_current_year_month(self):
        """
        Get the current year and month. For temporal queries like: "how many sales of part 123456 have sold this year so far?"

        Returns:
            tuple: A tuple containing the current year and month.
        """
        current_date = datetime.now()
        return current_date.year, current_date.month

    def _create_tool(self):
        return FunctionTool(
            metadata=ToolMetadata(
                description="Provides the current year and month. Useful for temporal queries.",
                name="date_tool",
            ),
            fn=self.get_current_year_month
        )
    
    

### Making tools for the functions used to analyze problems in inventory ###

In [9]:
import os
import logging
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Document,
    StorageContext,
    load_index_from_storage,
)

# Assuming these directories are correct
knowledge_dir = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/knowledge_database"
tools_dir = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/notebooks/tools_data"
db_file_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/databases/partswise_database.db"
engine = create_engine(f'sqlite:///{db_file_path}')

# Initialize and load indices
def load_or_create_index(directory, index_type):
    try:
        if not os.listdir(directory):  # Check if directory is empty
            raise FileNotFoundError("Directory is empty")
        
        storage_context = StorageContext.from_defaults(persist_dir=directory)
        index = load_index_from_storage(storage_context)
        logging.info(f"{index_type.capitalize()} index loaded successfully.")
        return index
    except FileNotFoundError:
        logging.info(f"{index_type.capitalize()} index not found or directory is empty. Creating new one.")
        # Create a dummy document if the directory is empty
        if not os.listdir(directory):
            dummy_doc = Document(text="Dummy document", doc_id="dummy")
            docs = [dummy_doc]
        else:
            docs = SimpleDirectoryReader(input_dir=directory).load_data()
        
        index = VectorStoreIndex.from_documents(docs)
        index.storage_context.persist(persist_dir=directory)
        logging.info(f"New {index_type.capitalize()} index created and persisted successfully.")
        return index
    except Exception as e:
        logging.error(f"An unexpected error occurred while loading {index_type} index: {str(e)}")
        raise

knowledge_index = load_or_create_index(knowledge_dir, "knowledge")
tools_index = load_or_create_index(tools_dir, "tools")

# Create query engines
def create_query_engine(index, top_k=3):
    try:
        engine = index.as_query_engine(similarity_top_k=top_k)
        logging.info("Query engine created successfully.")
        return engine
    except Exception as e:
        logging.error(f"Failed to create query engine: {str(e)}")
        raise

knowledge_engine = create_query_engine(knowledge_index)
tools_engine = create_query_engine(tools_index)

class ToolCollector:
    @staticmethod
    def collect_tools(date_tool, knowledge_database, sql_tool, visual_tool, analysis_tool):
        tools = [
            date_tool._create_tool(),
            knowledge_database.knowledge_tool,  # Use the knowledge_tool directly
            sql_tool._create_tool("SQL Query Tool", "Tool for in-depth analysis of parts department performance and inventory management improvement and best practices"),
            visual_tool._create_tool("Visual Query Tool", "Tool for generating visual representations of the query results to enhance user understanding of their data and performance."),
            analysis_tool._create_tool('Analysis tool', "Used to summarize and analyze parts department performance and provide an actionable snapshot of the data")
        ]
        return tools

# Assuming you have instances of your tool classes and the correct directory paths
knowledge_dir = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/knowledge_database"
date_tool_instance = DateTool()
knowledge_database_instance = KnowledgeDatabase(knowledge_dir)
sql_tool_instance = SQLQueryTool(
    engine=engine, 
    name="SQL Query Tool",
    description="Tool for in-depth analysis of parts department performance and inventory management"
)

# Create VisualQueryTool instance
visual_tool_instance = VisualQueryTool(
    engine=engine,
    name="Visual Query Tool",
    description="Tool for generating visual representations of the query results to enhance user understanding of their data."
)

# Create PerformanceSummaryTool instance
analysis_tool_instance = PerformanceSummaryTool(
    engine=engine,
    name="Analysis tool",
    description="Used to summarize and analyze parts department performance"
)


class ToolManager:
    def __init__(self, tools):
        self.tools = tools
        self.tool_documents = self._create_tool_documents()
        self.vector_index = self._create_vector_index()

    def _create_tool_documents(self):
        documents = []
        for i, tool in enumerate(self.tools):
            doc_content = f"{tool.metadata.name}: {tool.metadata.description}"
            documents.append(Document(text=doc_content, doc_id=str(i)))
        return documents
    
    def _create_vector_index(self):
        directory = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/notebooks/tools_data"
        
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f"Created directory: {directory}")
        
        try:
            # Try to load existing index
            storage_context = StorageContext.from_defaults(persist_dir=directory)
            index = load_index_from_storage(storage_context)
            print("Loaded existing vector index.")
        except (FileNotFoundError, ValueError):
            # If loading fails, create a new index
            print("Creating new vector index.")
            storage_context = StorageContext.from_defaults(persist_dir=directory)
            index = VectorStoreIndex.from_documents(self.tool_documents, storage_context=storage_context)
            index.storage_context.persist(persist_dir=directory)
            print(f"New index persisted to {directory}")
        
        return index

    def select_best_tools(self, query, num_tools=3):
        query_engine = self.vector_index.as_query_engine()
        response = query_engine.query(query)
        selected_tools = []
        for node in response.source_nodes[:num_tools]:
            tool_name = node.node.text.split(":")[0].strip()
            selected_tool = next((tool for tool in self.tools if tool.metadata.name == tool_name), None)
            if selected_tool:
                selected_tools.append(selected_tool)
        return selected_tools


AttributeError: 'SQLQueryTool' object has no attribute 'context_str_combined'

### Chatbot agent that uses tools ###
This is the first agent that I worked on. It uses a chatbot agent and the tools from above to give answers to the user input. 

To incorporate the knowledge database, I created a vector store index which I used to make a query engine tool. The chatbot agent was then initialized with the function tools and the query engine tools so that it could pull information from the inventory data and the knowledge database. 

In [None]:
from llama_index.core.agent import ReActAgent
from llama_index.llms.openai import OpenAI



all_tools = ToolCollector.collect_tools(
    date_tool_instance,
    knowledge_database_instance,
    sql_tool_instance,
    visual_tool_instance,
    analysis_tool_instance
)

tool_manager = ToolManager(all_tools)

tool_manager.tools = all_tools
tool_manager.tool_documents = tool_manager._create_tool_documents()
tool_manager.vector_index = tool_manager._create_vector_index()
all_tools_map = {t.metadata.name: t for t in all_tools}

tool_docs = [Document(text=f"{tool.metadata.name}: {tool.metadata.description}", doc_id=str(i)) for i, tool in enumerate(all_tools)]

# Create a vector index of tool descriptions
tool_index = VectorStoreIndex.from_documents(tool_docs)

def select_best_tools(query, num_tools=3):
    query_engine = tool_index.as_query_engine()
    response = query_engine.query(query)
    selected_tools = []
    for node in response.source_nodes[:num_tools]:
        tool_name = node.node.text.split(":")[0].strip()
        selected_tool = next((tool for tool in all_tools if tool.metadata.name == tool_name), None)
        if selected_tool:
            selected_tools.append(selected_tool)
    return selected_tools


agent = ReActAgent.from_tools(
    all_tools,
    llm=OpenAI(temperature=0.1, model="gpt-4o-mini-2024-07-18"),
    verbose=True
)

try:
    while True:
        text_input = input("Query (type 'bye' or 'exit' to quit the program): ")
        if text_input.lower() in ["exit", "bye"]:
            break
        
        best_tools = select_best_tools(text_input)
        print(f"Selected tools: {', '.join(tool.metadata.name for tool in best_tools)}")
        
        tool_descriptions = "\n".join([f"{tool.metadata.name}: {tool.metadata.description}" for tool in best_tools])
        prompt = f"""Based on the query, these tools seem most relevant:

{tool_descriptions}

Please prioritize using these tools to answer the following query. If these tools are not sufficient, you may use other available tools as needed.

Query: {text_input}"""

        response = agent.chat(prompt)
        print(f"Agent: {response}")
except Exception as e:
    logging.error("An error occurred: %s", str(e))

Loaded existing vector index.
Loaded existing vector index.
Selected tools: Analysis tool, SQL Query Tool
> Running step abf30ccd-702c-478e-822d-945b3dca8254. Step input: Based on the query, these tools seem most relevant:

Analysis tool: Used to summarize and analyze parts department performance and provide an actionable snapshot of the data
SQL Query Tool: Tool for in-depth analysis of parts department performance and inventory management improvement and best practices

Please prioritize using these tools to answer the following query. If these tools are not sufficient, you may use other available tools as needed.

Query: What bran had the most sales in June 2023?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use the SQL Query Tool to analyze the sales data for June 2023 to determine which brand had the most sales.
Action: SQL
Action Input: {'input': "SELECT brand, SUM(sales) as total_sales FROM sales_data WHERE sale_date BETWEEN '2023-06-01' AND '20

### Different agents for each tool ###

Creating crewai tools from the function tools above

Making agents for each tool and tasks that we want the agent to solve. Might be able to consolidate a few of the agents into a single one with multiple tools

example:
- Low ROI, obsolescence and high days supply are related
- Negative on hand quantity and stockouts are related

Tasks:
1. Summarize the entire parts department displaying KPI's a couple graphs like a bar chart with gross profit, sales revenue, and cogs for each month and a pie chart that breaks down the inventory category preentage. Provide some detailed information about the parts department and return some example tabular data and maybe a csv of problem parts?
2. Targeted analysis, provide data backed targeted responses for particular queries
3. Strategic advice: based on the state of the inventory and the questions asked, provide data driven strategic advice on hwo to help solve or treat the problem areas

2 and 3 might be able to be combined. Not sure how we would design the summary I was thinking something like a stock analysis

In [4]:
# Load model directly
import torch
from transformers import AutoTokenizer
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
auth_token = "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"


from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=auth_token,
)

stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# generate_kwargs parameters are taken from https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
llm = HuggingFaceLLM(
    model_name=model_name,
    model_kwargs={
        "token": auth_token,
        "torch_dtype": torch.bfloat16,
    },
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.95,
    },
    tokenizer_name=model_name,
    tokenizer_kwargs={"token": auth_token},
    stopping_ids=stopping_ids,
    device_map="mps",
    is_chat_model=True,
    system_prompt = (
        "Auto parts inventory management AI assistant. Functions:"
        "1. Analyze queries using appropriate tools (eg. SQLQueryTool, KnowledgeDatabase)."
        "2. Convert text to SQL when needed."
        "3. Interpret data for strategic advice."
        "4. Recommend actions to improve turnover, reduce obsolescence, and enhance performance."
        "5. Always support with specific data/metrics."
        "6. Consider business context and impact."
        "Process: Select tools, gather data, synthesize outputs, execute SQL if needed."
        "Respond clearly, concisely, with data-backed insights addressing the query."
    )
)


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:  65%|######4   | 3.23G/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 9.02 GB, other allocations: 384.00 KB, max allowed: 9.07 GB). Tried to allocate 112.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
response = llm.complete("when was the last time a canadian NHL team won the stanley cup and which team was it?")