Create an LLM pipeline that transforms any free-text query into a SQL query. The key points of this task are:

* Create a valid representation of the SQL tables that supports semantic search, so that the top results match the given free-text query

* Based on the table representation, the LLM has to create a real SQL query from the free-text user query that can be used immediately

* The LLM has to support creating queries of different levels of complexity, not only the simplest ones

* LLM has to support creating queries to fetch data from different database schemas

* When an error is encountered in an LLM-generated SQL query, the pipeline should attempt to self-correct

In [1]:
import csv
import os
import subprocess
import tempfile

import sqlparse
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

In [2]:
# Load the sqlcoder2 GGUF weights through the LlamaCpp wrapper.
# The path is written as a raw string: in a normal literal "\s" is an invalid
# escape sequence (a SyntaxWarning on Python 3.12+). The resulting path value
# is exactly the same as before.
# NOTE(review): n_ctx is 2048 while the model metadata reports a context
# length of 8192 — confirm 2048 is enough for the DDL plus the question.
llm = LlamaCpp(
    model_path=r"sqlcoder2-GGUF\sqlcoder2.Q8_0.gguf",
    n_batch=512,
    n_ctx=2048,
    n_gpu_layers=30,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 485 tensors from sqlcoder2-GGUF\sqlcoder2.Q8_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = starcoder
llama_model_loader: - kv   1:                               general.name str              = StarCoder
llama_model_loader: - kv   2:                   starcoder.context_length u32              = 8192
llama_model_loader: - kv   3:                 starcoder.embedding_length u32              = 6144
llama_model_loader: - kv   4:              starcoder.feed_forward_length u32              = 24576
llama_model_loader: - kv   5:                      starcoder.block_count u32              = 40
llama_model_loader: - kv   6:             starcoder.attention.head_count u32              = 48
llama_model_loader: - kv   7:          starcoder.attention.head_count_kv u32     

In [3]:
# Prompt text for the first (generation) pass. Placeholders:
#   {user_question}  - the free-text question; it appears twice in the prompt
#   {ddl_statements} - the schema DDL text shown to the model
# The model is told to answer with the word 'error' when the question does not
# match any table or column; is_sql_query_valid() checks the answer for it.
# NOTE(review): the <|...|> header tokens look like a Llama-3 style chat
# format, while the loaded model reports architecture "starcoder" — confirm
# this prompt format suits the model.
# NOTE(review): unlike verification_template, this prompt does not end with an
# opening ```sql fence — confirm the difference is intentional.
generation_template = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Generate a SQL query to answer this question: `{user_question}`
If the question does not match any existing tables or columns, return the word 'error' without generating a SQL query.

DDL statements:
{ddl_statements}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The following SQL query best answers the question `{user_question}`:
"""

In [4]:
# Prompt text for the second (verification / correction) pass. Placeholders:
#   {user_question}  - the free-text question; it appears twice in the prompt
#   {sql_query}      - the previously generated query to be checked
#   {ddl_statements} - the schema DDL text shown to the model
# The prompt ends with an opening ```sql fence, so the model's answer is
# expected to continue inside a code block.
# NOTE(review): the answer may therefore carry a closing ``` fence; nothing
# in the visible code strips it — verify against real model output.
verification_template = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Verify if this SQL query correctly answers the question: {user_question}.
SQL query: {sql_query}
If yes, return the same query. If not return corrected query.

DDL statements:
{ddl_statements}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The following SQL query best answers the question `{user_question}`:
```sql
"""

In [5]:
def is_sql_query_valid(sql_query):
    """
    Tell a usable LLM answer apart from the 'error' sentinel.

    The generation prompt asks the model to reply with the word 'error' when
    the question cannot be answered from the schema. Only the first token of
    the answer is inspected, so a real query that merely mentions the word
    (a column such as ``error_code`` or a literal ``'error'``) is no longer
    rejected, which a plain substring test would do.

    Args:
        sql_query: Raw text returned by the model.

    Returns:
        False if the answer is empty/blank or starts with the word 'error'
        (case-insensitive, ignoring surrounding quotes, backticks and
        punctuation); True otherwise.
    """
    # An empty or whitespace-only answer cannot be executed as a query.
    if not sql_query or not sql_query.strip():
        return False

    first_token = sql_query.split(None, 1)[0]
    # Drop decoration the model may add around the sentinel: 'error', `error`, error. / error:
    return first_token.strip("`'\".:;!,").lower() != "error"

In [6]:
# Prompt object for the generation pass; it exposes the two placeholders
# used in generation_template.
generation_prompt = PromptTemplate(
    input_variables=["user_question", "ddl_statements"],
    template=generation_template,
)

In [7]:
# Prompt object for the verification pass; it exposes the three placeholders
# used in verification_template.
verification_prompt = PromptTemplate(
    input_variables=["user_question", "sql_query", "ddl_statements"],
    template=verification_template,
)

In [8]:
def read_ddl_from_file(file_path):
    """Return the full text content of the file at ``file_path``."""
    with open(file_path, mode='r') as ddl_file:
        return ddl_file.read()

In [9]:
def get_sql_query_from_llm(prompt, llm, user_question, ddl_statements):
    """
    Pipe ``prompt`` into ``llm``, invoke the chain and return the stripped answer.

    The chain receives a dict with the keys ``user_question`` and
    ``ddl_statements``; leading and trailing whitespace is removed from
    whatever the chain returns.
    """
    chain_inputs = {
        "user_question": user_question,
        "ddl_statements": ddl_statements,
    }
    return (prompt | llm).invoke(chain_inputs).strip()
    

In [10]:
def is_query_correct(sql_query):
    """
    Run sqlfluff over a SQL query and report whether it lints cleanly.

    The query is written to a scratch file, ``sqlfluff fix`` is applied to
    that file (ansi dialect) and ``sqlfluff lint`` is then run on the result.
    The fixed text is only printed for debugging; it is not returned.

    Args:
        sql_query: SQL text to check.

    Returns:
        True if ``sqlfluff lint`` exits with status 0 after the fix pass;
        False if it exits non-zero or if sqlfluff could not be run.
    """
    # Print the original SQL query (for debugging purposes)
    print("Original SQL Query:")
    print(sql_query)
    print("\n")

    try:
        # Work in a private scratch directory so nothing in the current
        # directory is overwritten and no file is left behind afterwards.
        with tempfile.TemporaryDirectory() as scratch_dir:
            query_path = os.path.join(scratch_dir, "temp_query.sql")
            with open(query_path, "w", encoding="utf-8") as f:
                f.write(sql_query)

            # Let sqlfluff rewrite the file in place; only the lint pass
            # below decides the outcome.
            subprocess.run(
                ["sqlfluff", "fix", query_path, "--dialect", "ansi"],
                capture_output=True,
                text=True
            )

            # Print the fixed SQL query (for debugging purposes)
            with open(query_path, "r", encoding="utf-8") as f:
                fixed_sql_query = f.read()

            print("Fixed SQL Query:")
            print(fixed_sql_query)
            print("\n")

            lint_result = subprocess.run(
                ["sqlfluff", "lint", query_path, "--dialect", "ansi"],
                capture_output=True,
                text=True
            )
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        # Typically: sqlfluff is not installed or the scratch file is unusable.
        print(f"Error running sqlfluff: {e}")
        return False

    if lint_result.returncode == 0:
        return True

    # sqlfluff reports lint violations on stdout; stderr is included as well
    # so that tool-level failures are not lost.
    report = (lint_result.stdout + lint_result.stderr).strip()
    if report:
        print("Errors found in the SQL query:")
        print(report)

    return False

In [11]:
def verify_and_correct_sql(sql_query, user_question, ddl_statements):
    """
    Return ``sql_query`` unchanged when is_query_correct() accepts it;
    otherwise ask the LLM (via verification_prompt) for a corrected query
    and return that answer with surrounding whitespace stripped.
    """
    if is_query_correct(sql_query):
        return sql_query

    correction_chain = verification_prompt | llm
    correction_inputs = {
        "user_question": user_question,
        "sql_query": sql_query,
        "ddl_statements": ddl_statements,
    }
    return correction_chain.invoke(correction_inputs).strip()

In [12]:
def text_to_sql_pipe(user_question, ddl_file_path="database.sql"):
    """
    Turn a free-text question into SQL using the schema in ``ddl_file_path``.

    Steps: read the DDL, generate a first query, then (unless the answer is
    flagged by is_sql_query_valid) pass it through verify_and_correct_sql.
    Returns the final query, or a fixed explanatory message when the
    generated answer is flagged as invalid.
    """
    schema_ddl = read_ddl_from_file(ddl_file_path)

    # First pass: let the model draft a query.
    sql_query = get_sql_query_from_llm(generation_prompt, llm, user_question, schema_ddl)
    print(f"Generated SQL Query:\n{sql_query}\n")

    # Guard clause: the model signalled that the question does not fit the schema.
    if not is_sql_query_valid(sql_query):
        return "The query does not match any existing tables. Please check the table names or columns and try again."

    # Second pass: lint the draft and, if needed, ask the model to correct it.
    final_sql_query = verify_and_correct_sql(sql_query, user_question, schema_ddl)
    print(f"Final SQL Query After Verification and Correction:\n{final_sql_query}\n")
    return final_sql_query

In [13]:
def save_queries_to_csv(queries, filename):
    """
    Run every text query through text_to_sql_pipe and write the pairs to a CSV.

    The file is ';'-delimited and starts with the header row
    "Text Query";"Generated SQL Query". Each saved pair is also printed.
    """
    header = ["Text Query", "Generated SQL Query"]
    with open(filename, mode='w', newline='') as out_file:
        csv_out = csv.writer(out_file, delimiter=';')
        csv_out.writerow(header)

        for question in queries:
            # Translate the question, then persist and echo the pair.
            sql_text = text_to_sql_pipe(question)
            csv_out.writerow([question, sql_text])
            print(f"Saved query: {question}\nGenerated SQL:\n{sql_text}\n")

In [14]:
# Sample free-text questions that are fed through the pipeline; the list is
# passed to save_queries_to_csv() in the next cell.
queries = [
    "Show me the names of employees and their departments.",
    "What is the highest salary?",
    "List all orders that were made in the last 30 days.",
    "What are the total sales grouped by region?",
    "List employees whose salary is greater than $50,000.",
    "Get the names of customers and their lifetime value.",
    "Show products where the sales quantity is above 100.",
    "Which departments are managed by the employee with ID 5?",
    "List all sales representatives working in the 'West' region.",
    "What is the most recent sales report date?",
    "Find the average salary of employees in each department.",
    "List customers who have placed more than 5 orders.",
    "Find the total revenue generated by each product.",
    "Show the department with the highest number of employees.",
    "Get the total number of sales representatives in each region.",
    "Find the average lifetime value of customers grouped by region.",
    "List the top 3 products with the highest revenue.",
    "Find employees who have not received a salary in the last year.",
    "Show the total number of orders per customer.",
    "List all departments with more than one manager."
]


In [15]:
# Run the whole batch and store question/SQL pairs in the results file.
csv_filename = "sql_queries_results.csv"
save_queries_to_csv(queries, filename=csv_filename)


llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =     178.35 ms /    45 runs   (    3.96 ms per token,   252.31 tokens per second)
llama_print_timings: prompt eval time =   58007.48 ms /   681 tokens (   85.18 ms per token,    11.74 tokens per second)
llama_print_timings:        eval time =  178087.58 ms /    44 runs   ( 4047.45 ms per token,     0.25 tokens per second)
llama_print_timings:       total time =  290884.22 ms /   725 tokens


Generated SQL Query:
SELECT e.first_name, e.last_name, d.department_name FROM sales.employees AS e JOIN sales.departments AS d ON e.department_id = d.department_id;

Original SQL Query:
SELECT e.first_name, e.last_name, d.department_name FROM sales.employees AS e JOIN sales.departments AS d ON e.department_id = d.department_id;


Fixed SQL Query:
SELECT
    e.first_name,
    e.last_name,
    d.department_name
FROM sales.employees AS e
INNER JOIN sales.departments AS d ON e.department_id = d.department_id;



Final SQL Query After Verification and Correction:
SELECT e.first_name, e.last_name, d.department_name FROM sales.employees AS e JOIN sales.departments AS d ON e.department_id = d.department_id;

Saved query: Show me the names of employees and their departments.
Generated SQL:
SELECT e.first_name, e.last_name, d.department_name FROM sales.employees AS e JOIN sales.departments AS d ON e.department_id = d.department_id;



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       3.23 ms /    23 runs   (    0.14 ms per token,  7118.54 tokens per second)
llama_print_timings: prompt eval time =   36167.02 ms /   632 tokens (   57.23 ms per token,    17.47 tokens per second)
llama_print_timings:        eval time =    7847.61 ms /    22 runs   (  356.71 ms per token,     2.80 tokens per second)
llama_print_timings:       total time =   46556.98 ms /   654 tokens


Generated SQL Query:
SELECT MAX(salaries.salary_amount::TEXT::INT) AS max_salary FROM salaries;

Original SQL Query:
SELECT MAX(salaries.salary_amount::TEXT::INT) AS max_salary FROM salaries;


Fixed SQL Query:
SELECT MAX(salaries.salary_amount::TEXT::INT) AS max_salary FROM salaries;



Final SQL Query After Verification and Correction:
SELECT MAX(salaries.salary_amount::TEXT::INT) AS max_salary FROM salaries;

Saved query: What is the highest salary?
Generated SQL:
SELECT MAX(salaries.salary_amount::TEXT::INT) AS max_salary FROM salaries;



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       7.05 ms /    51 runs   (    0.14 ms per token,  7236.10 tokens per second)
llama_print_timings: prompt eval time =   44680.83 ms /   648 tokens (   68.95 ms per token,    14.50 tokens per second)
llama_print_timings:        eval time =   20028.41 ms /    50 runs   (  400.57 ms per token,     2.50 tokens per second)
llama_print_timings:       total time =   64961.53 ms /   698 tokens


Generated SQL Query:
SELECT sales.orders.order_date, sales.orders.customer_id, sales.orders.sales_rep_id FROM sales.orders WHERE sales.orders.order_date >= (CURRENT_DATE - interval '30 day');

Original SQL Query:
SELECT sales.orders.order_date, sales.orders.customer_id, sales.orders.sales_rep_id FROM sales.orders WHERE sales.orders.order_date >= (CURRENT_DATE - interval '30 day');


Fixed SQL Query:
SELECT
    sales.orders.order_date,
    sales.orders.customer_id,
    sales.orders.sales_rep_id
FROM sales.orders
WHERE sales.orders.order_date >= (CURRENT_DATE - INTERVAL '30 day');



Final SQL Query After Verification and Correction:
SELECT sales.orders.order_date, sales.orders.customer_id, sales.orders.sales_rep_id FROM sales.orders WHERE sales.orders.order_date >= (CURRENT_DATE - interval '30 day');

Saved query: List all orders that were made in the last 30 days.
Generated SQL:
SELECT sales.orders.order_date, sales.orders.customer_id, sales.orders.sales_rep_id FROM sales.orders WHERE 

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       5.76 ms /    42 runs   (    0.14 ms per token,  7286.61 tokens per second)
llama_print_timings: prompt eval time =   44031.62 ms /   638 tokens (   69.02 ms per token,    14.49 tokens per second)
llama_print_timings:        eval time =   17565.09 ms /    41 runs   (  428.42 ms per token,     2.33 tokens per second)
llama_print_timings:       total time =   62064.62 ms /   679 tokens


Generated SQL Query:
SELECT analytics.sales_reports.region, SUM(analytics.sales_reports.total_sales) AS total_sale FROM analytics.sales_reports GROUP BY analytics.sales_reports.region;

Original SQL Query:
SELECT analytics.sales_reports.region, SUM(analytics.sales_reports.total_sales) AS total_sale FROM analytics.sales_reports GROUP BY analytics.sales_reports.region;


Fixed SQL Query:
SELECT
    analytics.sales_reports.region,
    SUM(analytics.sales_reports.total_sales) AS total_sale
FROM analytics.sales_reports
GROUP BY analytics.sales_reports.region;



Final SQL Query After Verification and Correction:
SELECT analytics.sales_reports.region, SUM(analytics.sales_reports.total_sales) AS total_sale FROM analytics.sales_reports GROUP BY analytics.sales_reports.region;

Saved query: What are the total sales grouped by region?
Generated SQL:
SELECT analytics.sales_reports.region, SUM(analytics.sales_reports.total_sales) AS total_sale FROM analytics.sales_reports GROUP BY analytics.sales_

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       7.68 ms /    56 runs   (    0.14 ms per token,  7294.52 tokens per second)
llama_print_timings: prompt eval time =   46649.45 ms /   652 tokens (   71.55 ms per token,    13.98 tokens per second)
llama_print_timings:        eval time =   22115.12 ms /    55 runs   (  402.09 ms per token,     2.49 tokens per second)
llama_print_timings:       total time =   69007.37 ms /   707 tokens


Generated SQL Query:
SELECT employee_first_name, employee_last_name FROM sales.employees JOIN sales.salaries ON sales.employees.employee_id = sales.salaries.employee_id WHERE sales.salaries.salary_amount > 50000;

Original SQL Query:
SELECT employee_first_name, employee_last_name FROM sales.employees JOIN sales.salaries ON sales.employees.employee_id = sales.salaries.employee_id WHERE sales.salaries.salary_amount > 50000;


Fixed SQL Query:
SELECT
    employee_first_name,
    employee_last_name
FROM sales.employees
INNER JOIN
    sales.salaries
    ON sales.employees.employee_id = sales.salaries.employee_id
WHERE sales.salaries.salary_amount > 50000;





Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       8.17 ms /    60 runs   (    0.14 ms per token,  7343.94 tokens per second)
llama_print_timings: prompt eval time =   46000.71 ms /   714 tokens (   64.43 ms per token,    15.52 tokens per second)
llama_print_timings:        eval time =   23206.17 ms /    59 runs   (  393.32 ms per token,     2.54 tokens per second)
llama_print_timings:       total time =   69383.42 ms /   773 tokens
Llama.generate: prefix-match hit


Final SQL Query After Verification and Correction:
SELECT sales.employees.first_name, sales.employees.last_name FROM sales.employees JOIN sales.salaries ON sales.employees.employee_id = sales.salaries.employee_id WHERE sales.salaries.salary_amount > 50000;

Saved query: List employees whose salary is greater than $50,000.
Generated SQL:
SELECT sales.employees.first_name, sales.employees.last_name FROM sales.employees JOIN sales.salaries ON sales.employees.employee_id = sales.salaries.employee_id WHERE sales.salaries.salary_amount > 50000;




llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       6.10 ms /    44 runs   (    0.14 ms per token,  7209.57 tokens per second)
llama_print_timings: prompt eval time =   42353.34 ms /   650 tokens (   65.16 ms per token,    15.35 tokens per second)
llama_print_timings:        eval time =   16653.16 ms /    43 runs   (  387.28 ms per token,     2.58 tokens per second)
llama_print_timings:       total time =   59145.49 ms /   693 tokens


Generated SQL Query:
SELECT customer_name, lifetime_value::TEXT AS lifetime_value FROM sales.customers JOIN analytics.customer_metrics ON sales.customers.customer_id = analytics.customer_metrics.customer_id;

Original SQL Query:
SELECT customer_name, lifetime_value::TEXT AS lifetime_value FROM sales.customers JOIN analytics.customer_metrics ON sales.customers.customer_id = analytics.customer_metrics.customer_id;


Fixed SQL Query:
SELECT
    customer_name,
    lifetime_value::TEXT AS lifetime_value
FROM sales.customers
INNER JOIN
    analytics.customer_metrics
    ON sales.customers.customer_id = analytics.customer_metrics.customer_id;





Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       6.01 ms /    44 runs   (    0.14 ms per token,  7319.91 tokens per second)
llama_print_timings: prompt eval time =   44166.38 ms /   689 tokens (   64.10 ms per token,    15.60 tokens per second)
llama_print_timings:        eval time =   16783.73 ms /    43 runs   (  390.32 ms per token,     2.56 tokens per second)
llama_print_timings:       total time =   61039.32 ms /   732 tokens
Llama.generate: prefix-match hit


Final SQL Query After Verification and Correction:
SELECT customer_name, lifetime_value::TEXT AS lifetime_value FROM sales.customers JOIN analytics.customer_metrics ON sales.customers.customer_id = analytics.customer_metrics.customer_id;

Saved query: Get the names of customers and their lifetime value.
Generated SQL:
SELECT customer_name, lifetime_value::TEXT AS lifetime_value FROM sales.customers JOIN analytics.customer_metrics ON sales.customers.customer_id = analytics.customer_metrics.customer_id;




llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       5.12 ms /    37 runs   (    0.14 ms per token,  7233.63 tokens per second)
llama_print_timings: prompt eval time =   43280.44 ms /   656 tokens (   65.98 ms per token,    15.16 tokens per second)
llama_print_timings:        eval time =   13970.04 ms /    36 runs   (  388.06 ms per token,     2.58 tokens per second)
llama_print_timings:       total time =   57308.83 ms /   692 tokens


Generated SQL Query:
SELECT product_performance.product_id, product_performance.sales_quantity FROM analytics.product_performance WHERE product_performance.sales_quantity > 100;

Original SQL Query:
SELECT product_performance.product_id, product_performance.sales_quantity FROM analytics.product_performance WHERE product_performance.sales_quantity > 100;


Fixed SQL Query:
SELECT
    product_performance.product_id,
    product_performance.sales_quantity
FROM analytics.product_performance
WHERE product_performance.sales_quantity > 100;



Final SQL Query After Verification and Correction:
SELECT product_performance.product_id, product_performance.sales_quantity FROM analytics.product_performance WHERE product_performance.sales_quantity > 100;

Saved query: Show products where the sales quantity is above 100.
Generated SQL:
SELECT product_performance.product_id, product_performance.sales_quantity FROM analytics.product_performance WHERE product_performance.sales_quantity > 100;



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       2.63 ms /    18 runs   (    0.15 ms per token,  6846.71 tokens per second)
llama_print_timings: prompt eval time =   44401.68 ms /   646 tokens (   68.73 ms per token,    14.55 tokens per second)
llama_print_timings:        eval time =    6725.06 ms /    17 runs   (  395.59 ms per token,     2.53 tokens per second)
llama_print_timings:       total time =   51157.33 ms /   663 tokens


Generated SQL Query:
SELECT department_name FROM sales.departments WHERE manager_id = 5;

Original SQL Query:
SELECT department_name FROM sales.departments WHERE manager_id = 5;


Fixed SQL Query:
SELECT department_name FROM sales.departments WHERE manager_id = 5;



Final SQL Query After Verification and Correction:
SELECT department_name FROM sales.departments WHERE manager_id = 5;

Saved query: Which departments are managed by the employee with ID 5?
Generated SQL:
SELECT department_name FROM sales.departments WHERE manager_id = 5;



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       5.39 ms /    39 runs   (    0.14 ms per token,  7241.00 tokens per second)
llama_print_timings: prompt eval time =   45359.84 ms /   648 tokens (   70.00 ms per token,    14.29 tokens per second)
llama_print_timings:        eval time =   15285.07 ms /    38 runs   (  402.24 ms per token,     2.49 tokens per second)
llama_print_timings:       total time =   60721.25 ms /   686 tokens


Generated SQL Query:
SELECT sales.sales_reps.first_name, sales.sales_reps.last_name FROM sales.sales_reps WHERE sales.sales_reps.region = 'West';

Original SQL Query:
SELECT sales.sales_reps.first_name, sales.sales_reps.last_name FROM sales.sales_reps WHERE sales.sales_reps.region = 'West';


Fixed SQL Query:
SELECT
    sales.sales_reps.first_name,
    sales.sales_reps.last_name
FROM sales.sales_reps
WHERE sales.sales_reps.region = 'West';



Final SQL Query After Verification and Correction:
SELECT sales.sales_reps.first_name, sales.sales_reps.last_name FROM sales.sales_reps WHERE sales.sales_reps.region = 'West';

Saved query: List all sales representatives working in the 'West' region.
Generated SQL:
SELECT sales.sales_reps.first_name, sales.sales_reps.last_name FROM sales.sales_reps WHERE sales.sales_reps.region = 'West';



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       3.64 ms /    27 runs   (    0.13 ms per token,  7417.58 tokens per second)
llama_print_timings: prompt eval time =   39271.04 ms /   638 tokens (   61.55 ms per token,    16.25 tokens per second)
llama_print_timings:        eval time =   10544.25 ms /    26 runs   (  405.55 ms per token,     2.47 tokens per second)
llama_print_timings:       total time =   49859.73 ms /   664 tokens


Generated SQL Query:
SELECT MAX(report_date::TEXT::DATE) AS latest_sales_report_date FROM analytics.sales_reports;

Original SQL Query:
SELECT MAX(report_date::TEXT::DATE) AS latest_sales_report_date FROM analytics.sales_reports;


Fixed SQL Query:
SELECT MAX(report_date::TEXT::DATE) AS latest_sales_report_date
FROM analytics.sales_reports;



Final SQL Query After Verification and Correction:
SELECT MAX(report_date::TEXT::DATE) AS latest_sales_report_date FROM analytics.sales_reports;

Saved query: What is the most recent sales report date?
Generated SQL:
SELECT MAX(report_date::TEXT::DATE) AS latest_sales_report_date FROM analytics.sales_reports;



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       9.70 ms /    70 runs   (    0.14 ms per token,  7214.26 tokens per second)
llama_print_timings: prompt eval time =   39503.31 ms /   640 tokens (   61.72 ms per token,    16.20 tokens per second)
llama_print_timings:        eval time =   26960.67 ms /    69 runs   (  390.73 ms per token,     2.56 tokens per second)
llama_print_timings:       total time =   66657.58 ms /   709 tokens


Generated SQL Query:
SELECT d.department_name, AVG(s.salary_amount) AS avg_salary FROM sales.salaries s JOIN sales.employees e ON s.employee_id = e.employee_id JOIN sales.departments d ON e.department_id = d.department_id GROUP BY d.department_name;

Original SQL Query:
SELECT d.department_name, AVG(s.salary_amount) AS avg_salary FROM sales.salaries s JOIN sales.employees e ON s.employee_id = e.employee_id JOIN sales.departments d ON e.department_id = d.department_id GROUP BY d.department_name;


Fixed SQL Query:
SELECT
    d.department_name,
    AVG(s.salary_amount) AS avg_salary
FROM sales.salaries AS s
INNER JOIN sales.employees AS e ON s.employee_id = e.employee_id
INNER JOIN sales.departments AS d ON e.department_id = d.department_id
GROUP BY d.department_name;



Final SQL Query After Verification and Correction:
SELECT d.department_name, AVG(s.salary_amount) AS avg_salary FROM sales.salaries s JOIN sales.employees e ON s.employee_id = e.employee_id JOIN sales.departments d ON e.

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       5.34 ms /    38 runs   (    0.14 ms per token,  7116.10 tokens per second)
llama_print_timings: prompt eval time =   43398.58 ms /   642 tokens (   67.60 ms per token,    14.79 tokens per second)
llama_print_timings:        eval time =   15858.98 ms /    37 runs   (  428.62 ms per token,     2.33 tokens per second)
llama_print_timings:       total time =   59389.19 ms /   679 tokens


Generated SQL Query:
SELECT customer_name, COUNT(order_id) AS order_count FROM sales.orders GROUP BY customer_name HAVING COUNT(order_id) > 5;

Original SQL Query:
SELECT customer_name, COUNT(order_id) AS order_count FROM sales.orders GROUP BY customer_name HAVING COUNT(order_id) > 5;


Fixed SQL Query:
SELECT
    customer_name,
    COUNT(order_id) AS order_count
FROM sales.orders
GROUP BY customer_name
HAVING COUNT(order_id) > 5;



Final SQL Query After Verification and Correction:
SELECT customer_name, COUNT(order_id) AS order_count FROM sales.orders GROUP BY customer_name HAVING COUNT(order_id) > 5;

Saved query: List customers who have placed more than 5 orders.
Generated SQL:
SELECT customer_name, COUNT(order_id) AS order_count FROM sales.orders GROUP BY customer_name HAVING COUNT(order_id) > 5;



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       8.91 ms /    63 runs   (    0.14 ms per token,  7071.50 tokens per second)
llama_print_timings: prompt eval time =   47092.49 ms /   640 tokens (   73.58 ms per token,    13.59 tokens per second)
llama_print_timings:        eval time =   27506.16 ms /    62 runs   (  443.65 ms per token,     2.25 tokens per second)
llama_print_timings:       total time =   75997.54 ms /   702 tokens


Generated SQL Query:
SELECT p.product_name, SUM(pp.revenue_generated) AS total_revenue FROM sales.products AS p JOIN analytics.product_performance AS pp ON p.product_id = pp.product_id GROUP BY p.product_name ORDER BY total_revenue DESC;

Original SQL Query:
SELECT p.product_name, SUM(pp.revenue_generated) AS total_revenue FROM sales.products AS p JOIN analytics.product_performance AS pp ON p.product_id = pp.product_id GROUP BY p.product_name ORDER BY total_revenue DESC;


Fixed SQL Query:
SELECT
    p.product_name,
    SUM(pp.revenue_generated) AS total_revenue
FROM sales.products AS p
INNER JOIN analytics.product_performance AS pp ON p.product_id = pp.product_id
GROUP BY p.product_name
ORDER BY total_revenue DESC;



Final SQL Query After Verification and Correction:
SELECT p.product_name, SUM(pp.revenue_generated) AS total_revenue FROM sales.products AS p JOIN analytics.product_performance AS pp ON p.product_id = pp.product_id GROUP BY p.product_name ORDER BY total_revenue DESC;

Sa

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =      30.46 ms /   256 runs   (    0.12 ms per token,  8405.57 tokens per second)
llama_print_timings: prompt eval time =   38582.63 ms /   640 tokens (   60.29 ms per token,    16.59 tokens per second)
llama_print_timings:        eval time =  101349.81 ms /   255 runs   (  397.45 ms per token,     2.52 tokens per second)
llama_print_timings:       total time =  140616.94 ms /   895 tokens


Generated SQL Query:
SELECT departments.department_name, COUNT(DISTINCT employees.employee_id) AS employee_count FROM employees JOIN sales.orders ON employees.manager_id::text = orders.sales_rep_id::TEXT JOIN analytics.order_items ON orders.order_number::TEXT::integer::int4::text || ''::TEXT::text::int4::text::text::TEXT::numeric::text::int::text::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::NUMERIC::TEXT::INT::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::NUMERIC::TEXT::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::NUMERIC::TEXT::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::numeric::text::int::TEXT::NUMERIC::TEXT

Original SQL Query:
SELECT departments.department_name, COUNT(DISTINCT employees.employee_id) AS employee_cou

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       9.39 ms /    68 runs   (    0.14 ms per token,  7244.83 tokens per second)
llama_print_timings: prompt eval time =   56658.39 ms /   904 tokens (   62.68 ms per token,    15.96 tokens per second)
llama_print_timings:        eval time =   27853.46 ms /    67 runs   (  415.72 ms per token,     2.41 tokens per second)
llama_print_timings:       total time =   84846.62 ms /   971 tokens
Llama.generate: prefix-match hit


Final SQL Query After Verification and Correction:
SELECT departments.department_name, COUNT(DISTINCT employees.employee_id) AS employee_count FROM employees JOIN sales.orders ON employees.manager_id::text = orders.sales_rep_id::TEXT GROUP BY departments.department_name ORDER BY employee_count DESC NULLS LAST LIMIT 1;

Saved query: Show the department with the highest number of employees.
Generated SQL:
SELECT departments.department_name, COUNT(DISTINCT employees.employee_id) AS employee_count FROM employees JOIN sales.orders ON employees.manager_id::text = orders.sales_rep_id::TEXT GROUP BY departments.department_name ORDER BY employee_count DESC NULLS LAST LIMIT 1;




llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       6.89 ms /    50 runs   (    0.14 ms per token,  7261.11 tokens per second)
llama_print_timings: prompt eval time =   46685.39 ms /   654 tokens (   71.38 ms per token,    14.01 tokens per second)
llama_print_timings:        eval time =   19111.64 ms /    49 runs   (  390.03 ms per token,     2.56 tokens per second)
llama_print_timings:       total time =   65906.89 ms /   703 tokens


Generated SQL Query:
SELECT sales.sales_reps.region, COUNT(DISTINCT sales.sales_reps.sales_rep_id) AS number_of_sales_reps FROM sales.sales_reps GROUP BY sales.sales_reps.region;

Original SQL Query:
SELECT sales.sales_reps.region, COUNT(DISTINCT sales.sales_reps.sales_rep_id) AS number_of_sales_reps FROM sales.sales_reps GROUP BY sales.sales_reps.region;


Fixed SQL Query:
SELECT
    sales.sales_reps.region,
    COUNT(DISTINCT sales.sales_reps.sales_rep_id) AS number_of_sales_reps
FROM sales.sales_reps
GROUP BY sales.sales_reps.region;



Final SQL Query After Verification and Correction:
SELECT sales.sales_reps.region, COUNT(DISTINCT sales.sales_reps.sales_rep_id) AS number_of_sales_reps FROM sales.sales_reps GROUP BY sales.sales_reps.region;

Saved query: Get the total number of sales representatives in each region.
Generated SQL:
SELECT sales.sales_reps.region, COUNT(DISTINCT sales.sales_reps.sales_rep_id) AS number_of_sales_reps FROM sales.sales_reps GROUP BY sales.sales_reps.regi

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =      10.94 ms /    79 runs   (    0.14 ms per token,  7223.85 tokens per second)
llama_print_timings: prompt eval time =   41956.72 ms /   642 tokens (   65.35 ms per token,    15.30 tokens per second)
llama_print_timings:        eval time =   30562.62 ms /    78 runs   (  391.83 ms per token,     2.55 tokens per second)
llama_print_timings:       total time =   72628.06 ms /   720 tokens


Generated SQL Query:
SELECT sales_reps.region, AVG(customer_metrics.lifetime_value) AS avg_life_time_value FROM customer_metrics JOIN orders ON customer_metrics.customer_id = orders.customer_id JOIN sales_reps ON orders.sales_rep_id::integer = sales_reps.sales_rep_id GROUP BY sales_reps.region;

Original SQL Query:
SELECT sales_reps.region, AVG(customer_metrics.lifetime_value) AS avg_life_time_value FROM customer_metrics JOIN orders ON customer_metrics.customer_id = orders.customer_id JOIN sales_reps ON orders.sales_rep_id::integer = sales_reps.sales_rep_id GROUP BY sales_reps.region;


Fixed SQL Query:
SELECT
    sales_reps.region,
    AVG(customer_metrics.lifetime_value) AS avg_life_time_value
FROM customer_metrics
INNER JOIN orders ON customer_metrics.customer_id = orders.customer_id
INNER JOIN sales_reps ON orders.sales_rep_id::integer = sales_reps.sales_rep_id
GROUP BY sales_reps.region;



Final SQL Query After Verification and Correction:
SELECT sales_reps.region, AVG(customer_m

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =      15.08 ms /   103 runs   (    0.15 ms per token,  6829.79 tokens per second)
llama_print_timings: prompt eval time =   56236.43 ms /   644 tokens (   87.32 ms per token,    11.45 tokens per second)
llama_print_timings:        eval time =   44744.28 ms /   102 runs   (  438.67 ms per token,     2.28 tokens per second)
llama_print_timings:       total time =  101243.72 ms /   746 tokens


Generated SQL Query:
SELECT p.product_name, to_char(to_timestamp(ppe.revenue_generated::text::integer,'YYYYMMDD'), 'Day DD Mon YYYY') AS sales_day, SUM(ppe.sales_quantity) AS total_sales FROM analytics.product_performance ppe JOIN sales.products p ON ppe.product_id = p.product_id GROUP BY p.product_name, sales_day ORDER BY total_sales DESC LIMIT 3;

Original SQL Query:
SELECT p.product_name, to_char(to_timestamp(ppe.revenue_generated::text::integer,'YYYYMMDD'), 'Day DD Mon YYYY') AS sales_day, SUM(ppe.sales_quantity) AS total_sales FROM analytics.product_performance ppe JOIN sales.products p ON ppe.product_id = p.product_id GROUP BY p.product_name, sales_day ORDER BY total_sales DESC LIMIT 3;


Fixed SQL Query:
SELECT
    p.product_name,
    to_char(
        to_timestamp(ppe.revenue_generated::text::integer, 'YYYYMMDD'),
        'Day DD Mon YYYY'
    ) AS sales_day,
    sum(ppe.sales_quantity) AS total_sales
FROM analytics.product_performance AS ppe
INNER JOIN sales.products AS p ON pp

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =      11.89 ms /    85 runs   (    0.14 ms per token,  7150.67 tokens per second)
llama_print_timings: prompt eval time =   49535.73 ms /   646 tokens (   76.68 ms per token,    13.04 tokens per second)
llama_print_timings:        eval time =   34098.51 ms /    84 runs   (  405.93 ms per token,     2.46 tokens per second)
llama_print_timings:       total time =   83778.98 ms /   730 tokens


Generated SQL Query:
SELECT e."employee_id", e."first_name", e."last_name" FROM sales.employees AS e WHERE NOT EXISTS (SELECT 1 FROM sales.salaries s WHERE e."employee_id" = s."employee_id" AND s."effective_date" >= CURRENT_DATE - INTERVAL '1 year') ORDER BY e."first_name", e."last_name";

Original SQL Query:
SELECT e."employee_id", e."first_name", e."last_name" FROM sales.employees AS e WHERE NOT EXISTS (SELECT 1 FROM sales.salaries s WHERE e."employee_id" = s."employee_id" AND s."effective_date" >= CURRENT_DATE - INTERVAL '1 year') ORDER BY e."first_name", e."last_name";


Fixed SQL Query:
SELECT
    e.employee_id,
    e.first_name,
    e.last_name
FROM sales.employees AS e
WHERE
    NOT EXISTS (
        SELECT 1
        FROM sales.salaries AS s
        WHERE
            e.employee_id = s.employee_id
            AND s.effective_date >= CURRENT_DATE - INTERVAL '1 year'
    )
ORDER BY e.first_name, e.last_name;



Final SQL Query After Verification and Correction:
SELECT e."employee_id

Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       5.28 ms /    38 runs   (    0.14 ms per token,  7194.24 tokens per second)
llama_print_timings: prompt eval time =   41970.33 ms /   638 tokens (   65.78 ms per token,    15.20 tokens per second)
llama_print_timings:        eval time =   13593.55 ms /    37 runs   (  367.39 ms per token,     2.72 tokens per second)
llama_print_timings:       total time =   55608.21 ms /   675 tokens


Generated SQL Query:
SELECT sales.orders.customer_id, COUNT(sales.orders.order_id) AS order_count FROM sales.orders GROUP BY sales.orders.customer_id;

Original SQL Query:
SELECT sales.orders.customer_id, COUNT(sales.orders.order_id) AS order_count FROM sales.orders GROUP BY sales.orders.customer_id;


Fixed SQL Query:
SELECT
    sales.orders.customer_id,
    COUNT(sales.orders.order_id) AS order_count
FROM sales.orders
GROUP BY sales.orders.customer_id;



Final SQL Query After Verification and Correction:
SELECT sales.orders.customer_id, COUNT(sales.orders.order_id) AS order_count FROM sales.orders GROUP BY sales.orders.customer_id;

Saved query: Show the total number of orders per customer.
Generated SQL:
SELECT sales.orders.customer_id, COUNT(sales.orders.order_id) AS order_count FROM sales.orders GROUP BY sales.orders.customer_id;



Llama.generate: prefix-match hit

llama_print_timings:        load time =   38041.23 ms
llama_print_timings:      sample time =       5.42 ms /    37 runs   (    0.15 ms per token,  6830.35 tokens per second)
llama_print_timings: prompt eval time =   54940.12 ms /   640 tokens (   85.84 ms per token,    11.65 tokens per second)
llama_print_timings:        eval time =   19778.38 ms /    36 runs   (  549.40 ms per token,     1.82 tokens per second)
llama_print_timings:       total time =   74811.94 ms /   676 tokens


Generated SQL Query:
SELECT department_name, COUNT(manager_id) AS managers FROM public.departments GROUP BY department_name HAVING COUNT(manager_id) > 1;

Original SQL Query:
SELECT department_name, COUNT(manager_id) AS managers FROM public.departments GROUP BY department_name HAVING COUNT(manager_id) > 1;


Fixed SQL Query:
SELECT
    department_name,
    COUNT(manager_id) AS managers
FROM public.departments
GROUP BY department_name
HAVING COUNT(manager_id) > 1;



Final SQL Query After Verification and Correction:
SELECT department_name, COUNT(manager_id) AS managers FROM public.departments GROUP BY department_name HAVING COUNT(manager_id) > 1;

Saved query: List all departments with more than one manager.
Generated SQL:
SELECT department_name, COUNT(manager_id) AS managers FROM public.departments GROUP BY department_name HAVING COUNT(manager_id) > 1;

