In [None]:
# Colab-only sign-in step; run it before the cells that create the BigQuery and Vertex AI clients.
from google.colab import auth
auth.authenticate_user()

In [None]:
from vertexai.language_models import CodeChatSession
from vertexai.language_models import CodeChatModel
from google.cloud import bigquery
import vertexai
import pandas as pd
import time
import os

In [None]:
# Display tweaks so long SQL strings and wide result frames stay fully visible.
pd.options.display.max_colwidth = None  # no cap on column width: show entire cell contents
pd.options.display.expand_frame_repr = False  # do not wrap wide DataFrame reprs across several lines


In [None]:
# Project, model and sampling settings shared by the cells below.
PROJECT_ID = 'pradeep-genai'
MODEL_NAME = 'codechat-bison'
TEMPERATURE = 1
MAX_OUTPUT_TOKENS = 2048  # maximum length of the model response; overrides the default value of 128
# TOP_P = 0.95  # default value
# TOP_K = 40  # default value
LOCATION = 'us-central1'

In [None]:
# BigQuery dataset and the tables whose column metadata is pulled from INFORMATION_SCHEMA.
# NOTE(review): 'loyality_points' looks like a misspelling of 'loyalty_points' — confirm against
# the real table name before changing it, since the string is matched against table_name.
DATASET = 'flight_reservations'
TABLES = ['customers', 'flights', 'reservations', 'transactions', 'loyality_points']

In [None]:
# Create the BigQuery client and the Vertex AI chat model once; later cells reuse them.
bq_client = bigquery.Client(project=PROJECT_ID)
# Use the LOCATION constant from the configuration cell instead of repeating the region literal,
# so changing the region in one place is enough.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = CodeChatModel.from_pretrained(MODEL_NAME)

In [None]:
# Double-quote every table name and comma-join them for the IN (...) filter.
quoted_tables = ','.join(f'"{table}"' for table in TABLES)

# Column-level metadata (one row per field path) for the selected tables.
query = f"""
    SELECT *
    FROM `{PROJECT_ID}.{DATASET}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ({quoted_tables})
"""
print(query)


    SELECT *
    FROM `pradeep-genai.flight_reservations.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ("customers","flights","reservations","transactions","loyality_points")



In [None]:
# Run the metadata query and load the column listing into a pandas DataFrame.
schema_columns = bq_client.query(query=query).to_dataframe()
schema_columns

Unnamed: 0,table_catalog,table_schema,table_name,column_name,field_path,data_type,description,collation_name,rounding_mode
0,pradeep-genai,flight_reservations,transactions,transaction_id,transaction_id,INT64,,,
1,pradeep-genai,flight_reservations,transactions,reservation_id,reservation_id,INT64,,,
2,pradeep-genai,flight_reservations,transactions,amount,amount,FLOAT64,,,
3,pradeep-genai,flight_reservations,transactions,transaction_datetime,transaction_datetime,DATETIME,,,
4,pradeep-genai,flight_reservations,reservations,reservation_id,reservation_id,INT64,,,
5,pradeep-genai,flight_reservations,reservations,customer_id,customer_id,INT64,,,
6,pradeep-genai,flight_reservations,reservations,flight_id,flight_id,INT64,,,
7,pradeep-genai,flight_reservations,reservations,reservation_datetime,reservation_datetime,DATETIME,,,
8,pradeep-genai,flight_reservations,reservations,status,status,STRING,,,
9,pradeep-genai,flight_reservations,flights,flight_id,flight_id,INT64,,,


In [None]:
# Render the schema as a Markdown table so it can be pasted into the prompt.
# The name schema_columns is rebound from DataFrame to str here; the guard keeps the cell
# idempotent, so re-running it does not fail with "'str' object has no attribute 'to_markdown'".
if not isinstance(schema_columns, str):
    schema_columns = schema_columns.to_markdown(index=False)
print(schema_columns)

| table_catalog   | table_schema        | table_name   | column_name          | field_path           | data_type   | description   | collation_name   | rounding_mode   |
|:----------------|:--------------------|:-------------|:---------------------|:---------------------|:------------|:--------------|:-----------------|:----------------|
| pradeep-genai   | flight_reservations | transactions | transaction_id       | transaction_id       | INT64       |               | NULL             |                 |
| pradeep-genai   | flight_reservations | transactions | reservation_id       | reservation_id       | INT64       |               | NULL             |                 |
| pradeep-genai   | flight_reservations | transactions | amount               | amount               | FLOAT64     |               | NULL             |                 |
| pradeep-genai   | flight_reservations | transactions | transaction_datetime | transaction_datetime | DATETIME    |               | NULL             

In [None]:
def _extract_sql(response_text):
    """
    Pull the SQL statement out of a model reply.

    The reply is expected to wrap the query in a Markdown code fence (```sql ... ```).
    Replies without a fence, with prose around the fence, or with a fence that was never
    closed are handled too, instead of blindly dropping the first and last line.

    Args:
    - response_text (str): Raw text returned by the model.

    Returns:
    - str: The query text with the fence markers removed.
    """
    fence = "```"
    text = response_text.strip()
    open_at = text.find(fence)
    if open_at == -1:
        # No fence at all: the whole reply is taken to be the query.
        return text
    line_end = text.find("\n", open_at)
    if line_end == -1:
        # Everything sits on one line (e.g. ```SELECT 1```): just peel the backticks off.
        return text.strip("`").strip()
    # Skip the opening fence line, which may carry a language tag such as ```sql.
    body = text[line_end + 1:]
    close_at = body.find(fence)
    if close_at != -1:
        body = body[:close_at]
    return body.rstrip()


def generate_and_execute_sql(prompt, max_tries=5, return_all=False):
    """
    Ask the chat model for a SQL query, run it with bq_client, and repeat for max_tries rounds,
    collecting every query that executed successfully and ranking them by execution latency.

    One CodeChatSession (built from the module-level `model`, TEMPERATURE and
    MAX_OUTPUT_TOKENS) is used for all rounds, so the model sees the earlier exchanges.
    When a generated query fails, the next prompt carries the error message and the failing
    query. When the model call itself fails there is no query to repair, so the error is
    recorded and the same prompt is retried.

    Args:
    - prompt (str): Prompt to provide to the model for generating SQL.
    - max_tries (int): Maximum number of attempts to generate and execute SQL.
    - return_all (bool): Flag to determine whether to return all successful queries or only the fastest.

    Returns:
    - dict: One of three shapes:
        * no attempt succeeded: {"error": str, "prompts": list of prompts tried, "errors": list of messages}
        * return_all=True: {"dataframe": DataFrame with columns Query / Result / Latency, fastest first}
        * return_all=False: {"fastest_query": str, "result": DataFrame, "latency": float seconds}
    """

    error_messages = []
    prompts = [prompt]
    successful_queries = []

    chat_session = CodeChatSession(model=model,
                                   temperature=TEMPERATURE,
                                   max_output_tokens=MAX_OUTPUT_TOKENS)

    for attempt in range(1, max_tries + 1):
        print(f'TRIAL: {attempt}')
        # Reset per attempt so the error handler can tell "model call failed" apart from
        # "generated query failed" and never reads an unbound or stale query.
        generated_sql_query = None
        try:
            # Predict SQL using the model
            response = chat_session.send_message(prompt, temperature=TEMPERATURE, max_output_tokens=MAX_OUTPUT_TOKENS)
            generated_sql_query = _extract_sql(response.text)
            print('-' * 50)
            print(generated_sql_query)
            print('-' * 50)
            # Execute SQL using BigQuery client. The timer starts here so the latency used
            # for ranking covers query execution and result download only, not the time
            # the model spent generating the text.
            start_time = time.perf_counter()
            df = bq_client.query(generated_sql_query).to_dataframe()
            latency = time.perf_counter() - start_time
            successful_queries.append({
                "query": generated_sql_query,
                "dataframe": df,
                "latency": latency
            })
            print('SUCCEEDED')
            # Evolve the prompt for success path to optimize the last successful query for latency.
            # NOTE(review): this only triggers from the second success onward; after the first
            # success the unchanged prompt is sent again — confirm that is intended.
            if len(successful_queries) > 1:
                prompt = f"""Modify the last successful SQL query by making changes to it and optimizing it for latency.
            ENSURE that the NEW QUERY is DIFFERENT from the previous one while prioritizing faster execution.
            The last successful query was:
            {successful_queries[-1]["query"]}"""
        except Exception as e:
            print('FAILED')
            # Catch the error, store the message, and try again
            msg = str(e)
            error_messages.append(msg)
            if generated_sql_query is not None:
                # Evolve the prompt by appending the error message and asking the model to correct it
                prompt = f"""Encountered an error: {msg}.
To address this, please generate an alternative SQL query response that avoids this specific error.
Follow the instructions mentioned above to remediate the error.

Modify the below SQL query to resolve the issue and ensure it is not a repetition of all previously generated queries.
{generated_sql_query}

Ensure the revised SQL query aligns precisely with the requirements outlined in the initial question.
Additionally, please optimize the query for latency while maintaining correctness and efficiency."""
                prompts.append(prompt)
            # else: the model call itself failed, so there is no SQL to fix; the same prompt
            # is simply retried on the next attempt.
        print('=' * 100)

    # If no successful queries
    if not successful_queries:
        return {
            "error": "All attempts exhausted.",
            "prompts": prompts,
            "errors": error_messages
        }

    # Sort successful queries by latency (fastest first)
    successful_queries.sort(key=lambda x: x['latency'])

    if return_all:
        df = pd.DataFrame([(q["query"], q["dataframe"], q["latency"]) for q in successful_queries], columns=["Query", "Result", "Latency"])
        return {
            "dataframe": df
        }
    else:
        return {
            "fastest_query": successful_queries[0]["query"],
            "result": successful_queries[0]["dataframe"],
            "latency": successful_queries[0]["latency"]
        }

In [None]:
# Prompt template with two positional str.format slots: the first is filled with the
# natural-language question, the second with the schema text.
seed_prompt = """
Please craft a SQL query for BigQuery that addresses the following QUESTION provided below.
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below.
When joining tables, employ type coercion to guarantee data type consistency for the join columns.
Additionally, the output column names should specify units where applicable.\n
QUESTION:
{}\n
SCHEMA:
{}\n
IMPORTANT:
Use ONLY DATETIME and DO NOT use TIMESTAMP.
--
Ensure your SQL query accurately defines both the start and end of the DATETIME range.
"""
print(seed_prompt)


Please craft a SQL query for BigQuery that addresses the following QUESTION provided below. 
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below. 
When joining tables, employ type coercion to guarantee data type consistency for the join columns. 
Additionally, the output column names should specify units where applicable.

QUESTION:
{}

SCHEMA:
{}

IMPORTANT: 
Use ONLY DATETIME and DO NOT use TIMESTAMP.
--
Ensure your SQL query accurately defines both the start and end of the DATETIME range.



In [None]:
# Natural-language question that fills the QUESTION slot of seed_prompt (fixed date range).
question = "Provide a list of all the flight reservations from October 10th to October 15th, 2023"

In [None]:
# Fill the template: the first slot receives the question, the second the Markdown schema table.
prompt = seed_prompt.format(question, schema_columns)
print(prompt)


Please craft a SQL query for BigQuery that addresses the following QUESTION provided below. 
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below. 
When joining tables, employ type coercion to guarantee data type consistency for the join columns. 
Additionally, the output column names should specify units where applicable.

QUESTION:
Provide a list of all the flight reservations from October 10th to October 15th, 2023

SCHEMA:
| table_catalog   | table_schema        | table_name   | column_name          | field_path           | data_type   | description   | collation_name   | rounding_mode   |
|:----------------|:--------------------|:-------------|:---------------------|:---------------------|:------------|:--------------|:-----------------|:----------------|
| pradeep-genai   | flight_reservations | transactions | transaction_id       | transaction_id       | INT64       |               | NULL             |                 |
| pradeep-ge

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
# When every attempt fails the helper returns {"error", "prompts", "errors"} with no
# "dataframe" key; report the collected errors instead of failing with a bare KeyError.
if 'dataframe' not in response:
    raise RuntimeError(f"{response['error']} Errors: {response['errors']}")
sql_output = response['dataframe']
sql_output


TRIAL: 1
--------------------------------------------------
SELECT
    'Reservation ID: ' || CAST(r.reservation_id AS STRING)              AS reservation_id,
    'Customer ID: '   || CAST(r.customer_id AS STRING)                 AS customer_id,
    'Flight ID: '     || CAST(r.flight_id AS STRING)                    AS flight_id,
    'Departure Date: '|| DATE(f.departure_datetime)                     AS departure_date,
    'Departure Time: '|| TIME(f.departure_datetime)                     AS departure_time,
    'Arrival Date: ' || DATE(f.arrival_datetime)                       AS arrival_date,
    'Arrival Time: ' || TIME(f.arrival_datetime)                       AS arrival_time,
    'Origin: '        || f.origin                                     AS origin,
    'Destination: '   || f.destination                                 AS destination,
    'Carrier: '       || f.carrier                                    AS carrier,
    'Price: '         || CAST(f.price AS STRING)             

Unnamed: 0,Query,Result,Latency
0,"SELECT\n 'Reservation ID: ' || CAST(r.reservation_id AS STRING) AS reservation_id,\n 'Customer ID: ' || CAST(r.customer_id AS STRING) AS customer_id,\n 'Flight ID: ' || CAST(r.flight_id AS STRING) AS flight_id,\n 'Departure Date: ' || DATE(f.departure_datetime) AS departure_date,\n 'Departure Time: ' || TIME(f.departure_datetime) AS departure_time,\n 'Arrival Date: ' || DATE(f.arrival_datetime) AS arrival_date,\n 'Arrival Time: ' || TIME(f.arrival_datetime) AS arrival_time,\n 'Origin: ' || f.origin AS origin,\n 'Destination: ' || f.destination AS destination,\n 'Carrier: ' || f.carrier AS carrier,\n 'Price: ' || CAST(f.price AS STRING) || ' USD' AS price_usd\nFROM\n flight_reservations.reservations AS r\nINNER JOIN\n flight_reservations.flights AS f\nON\n r.flight_id = f.flight_id\nWHERE\n r.reservation_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59'\nORDER BY\n r.reservation_datetime DESC;",reservation_id customer_id flight_id departure_date departure_time arrival_date arrival_time origin destination carrier price_usd 0 Reservation ID: 8 Customer ID: 8 Flight ID: 8 Departure Date: 2023-11-30 Departure Time: 10:00:00 Arrival Date: 2023-11-30 Arrival Time: 13:30:00 Origin: MIA Destination: JFK Carrier: American Price: 380 USD 1 Reservation ID: 7 Customer ID: 6 Flight ID: 7 Departure Date: 2023-11-27 Departure Time: 20:00:00 Arrival Date: 2023-11-27 Arrival Time: 23:30:00 Origin: JFK Destination: MIA Carrier: American Price: 380 USD 2 Reservation ID: 6 Customer ID: 6 Flight ID: 6 Departure Date: 2023-11-25 Departure Time: 06:00:00 Arrival Date: 2023-11-25 Arrival Time: 14:30:00 Origin: SEA Destination: JFK Carrier: United Price: 550 USD,5.055454
1,"WITH RankedReservations AS (\n SELECT\n r.reservation_id,\n r.customer_id,\n r.flight_id,\n r.reservation_datetime,\n ROW_NUMBER() OVER (PARTITION BY r.reservation_id ORDER BY r.reservation_datetime DESC) AS reservation_rank\n FROM\n flight_reservations.reservations AS r\n WHERE\n r.reservation_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59'\n)\n\nSELECT DISTINCT\n 'Reservation ID: ' || CAST(rr.reservation_id AS STRING) AS reservation_id,\n 'Customer ID: ' || CAST(rr.customer_id AS STRING) AS customer_id,\n 'Flight ID: ' || CAST(rr.flight_id AS STRING) AS flight_id,\n DATE(f.departure_datetime) AS departure_date,\n TIME(f.departure_datetime) AS departure_time,\n DATE(f.arrival_datetime) AS arrival_date,\n TIME(f.arrival_datetime) AS arrival_time,\n f.origin AS origin,\n f.destination AS destination,\n f.carrier AS carrier,\n CAST(f.price AS STRING) || ' USD' AS price_usd\nFROM\n RankedReservations AS rr\nJOIN\n flight_reservations.flights AS f\nON\n rr.flight_id = f.flight_id\nWHERE\n rr.reservation_rank = 1;",reservation_id customer_id flight_id departure_date departure_time arrival_date arrival_time origin destination carrier price_usd 0 Reservation ID: 6 Customer ID: 6 Flight ID: 6 2023-11-25 06:00:00 2023-11-25 14:30:00 SEA JFK United 550 USD 1 Reservation ID: 7 Customer ID: 6 Flight ID: 7 2023-11-27 20:00:00 2023-11-27 23:30:00 JFK MIA American 380 USD 2 Reservation ID: 8 Customer ID: 8 Flight ID: 8 2023-11-30 10:00:00 2023-11-30 13:30:00 MIA JFK American 380 USD,6.706999


In [None]:
# sql_output is ordered by ascending latency, so row 0 holds the result of the fastest successful query.
result_df = sql_output.loc[0, 'Result']
result_df


Unnamed: 0,reservation_id,customer_id,flight_id,departure_date,departure_time,arrival_date,arrival_time,origin,destination,carrier,price_usd
0,Reservation ID: 8,Customer ID: 8,Flight ID: 8,Departure Date: 2023-11-30,Departure Time: 10:00:00,Arrival Date: 2023-11-30,Arrival Time: 13:30:00,Origin: MIA,Destination: JFK,Carrier: American,Price: 380 USD
1,Reservation ID: 7,Customer ID: 6,Flight ID: 7,Departure Date: 2023-11-27,Departure Time: 20:00:00,Arrival Date: 2023-11-27,Arrival Time: 23:30:00,Origin: JFK,Destination: MIA,Carrier: American,Price: 380 USD
2,Reservation ID: 6,Customer ID: 6,Flight ID: 6,Departure Date: 2023-11-25,Departure Time: 06:00:00,Arrival Date: 2023-11-25,Arrival Time: 14:30:00,Origin: SEA,Destination: JFK,Carrier: United,Price: 550 USD


In [None]:
# Natural-language question that fills the QUESTION slot of seed_prompt (relative date window).
question = "Identify all the customers who have made flight reservations within the last 7 days."

In [None]:
# Fill the template: the first slot receives the question, the second the Markdown schema table.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
# When every attempt fails the helper returns {"error", "prompts", "errors"} with no
# "dataframe" key; report the collected errors instead of failing with a bare KeyError.
if 'dataframe' not in response:
    raise RuntimeError(f"{response['error']} Errors: {response['errors']}")
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
-- Identify customers who made flight reservations within the last 7 days

-- This query utilizes BigQuery's DATETIME functions to perform date arithmetic
-- for finding transactions within the last 7 days.

WITH RecentTransactions AS (
    SELECT DISTINCT(customer_id)
    FROM flight_reservations.transactions
    WHERE transaction_datetime >= DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY)
)

-- Select customer information for those who made recent transactions
SELECT c.customer_id, c.first_name, c.last_name, c.email
FROM RecentTransactions rt
JOIN flight_reservations.customers c ON rt.customer_id = c.customer_id;
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
SELECT DISTINCT C.customer_id, C.first_name, C.last_name, C.email
FROM flight_reservations.Customers C
JOIN flight_reservations.Reservations R ON C.customer_id = R.customer_id
WHERE R.reservation_dateti

Unnamed: 0,Query,Result,Latency
0,"WITH RecentReservations AS (\n SELECT DISTINCT customer_id\n FROM `flight_reservations.reservations`\n WHERE reservation_datetime >= DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY)\n)\n\nSELECT *\nFROM `flight_reservations.customers`\nWHERE customer_id IN (SELECT customer_id FROM RecentReservations);","Empty DataFrame Columns: [customer_id, first_name, last_name, email, date_of_birth, created_at, age_group] Index: []",4.169661
1,"-- Identify customers with reservations in the last 7 days\n\n-- Create a common table expression (CTE) to find reservation IDs for the past week.\nWITH RecentReservations AS (\n SELECT DISTINCT reservation_id\n FROM flight_reservations.reservations\n WHERE reservation_datetime >= DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY)\n)\n\n-- Select customer details for customers with recent reservations.\nSELECT c.customer_id, c.first_name, c.last_name, c.email\nFROM flight_reservations.customers AS c\nJOIN RecentReservations AS rr ON c.customer_id = rr.reservation_id;","Empty DataFrame Columns: [customer_id, first_name, last_name, email] Index: []",4.358016


In [None]:
# sql_output is ordered by ascending latency, so row 0 holds the result of the fastest successful query.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,customer_id,first_name,last_name,email,date_of_birth,created_at,age_group


In [None]:
# Natural-language question that fills the QUESTION slot of seed_prompt (aggregate with a status filter).
question = "Calculate the total revenue generated from transactions in October 2023, specifically from all reservations with a Confirmed status."

In [None]:
# Fill the template: the first slot receives the question, the second the Markdown schema table.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
# When every attempt fails the helper returns {"error", "prompts", "errors"} with no
# "dataframe" key; report the collected errors instead of failing with a bare KeyError.
if 'dataframe' not in response:
    raise RuntimeError(f"{response['error']} Errors: {response['errors']}")
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
WITH MonthlyConfirmedTransactions AS (
    SELECT
        t.transaction_id,
        t.amount,
        t.transaction_datetime
    FROM
        flight_reservations.transactions t
    JOIN
        flight_reservations.reservations r
    ON
        CAST(t.reservation_id AS STRING) = CAST(r.reservation_id AS STRING)
    WHERE
        CAST(DATE(t.transaction_datetime) AS DATE) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31') 
        AND r.status = 'Confirmed'
)

SELECT
    SUM(MonthlyConfirmedTransactions.amount) AS TotalRevenueInUSD
FROM
    MonthlyConfirmedTransactions;
--------------------------------------------------
SUCCEEDED
TRIAL: 2
--------------------------------------------------
WITH MonthlyConfirmedTransactions AS (
    SELECT
        t.transaction_id,
        t.amount,
        t.transaction_datetime
    FROM
        flight_reservations.transactions t
    JOIN
        flight_reservations.reservations r
    ON
        C

Unnamed: 0,Query,Result,Latency
0,-- Apply appropriate indexes on the transaction_datetime and status columns to optimize performance.\nSELECT\n SUM(t.amount) AS TotalRevenueInUSD\nFROM\n flight_reservations.transactions t\nJOIN\n flight_reservations.reservations r\nON\n t.reservation_id = r.reservation_id -- Use numeric comparison instead of string casting\nWHERE\n DATE(t.transaction_datetime) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31') \n AND r.status = 'Confirmed';,TotalRevenueInUSD 0 3860.0,3.994663
1,"WITH MonthlyConfirmedTransactions AS (\n SELECT\n t.transaction_id,\n t.amount,\n t.transaction_datetime\n FROM\n flight_reservations.transactions t\n JOIN\n flight_reservations.reservations r\n ON\n CAST(t.reservation_id AS STRING) = CAST(r.reservation_id AS STRING)\n WHERE\n CAST(DATE(t.transaction_datetime) AS DATE) BETWEEN '2023-10-01' AND '2023-10-31'\n AND r.status = 'Confirmed'\n)\n\nSELECT\n SUM(MonthlyConfirmedTransactions.amount) AS TotalRevenueInUSD\nFROM\n MonthlyConfirmedTransactions;",TotalRevenueInUSD 0 3860.0,4.397673
2,SELECT\n SUM(t.amount) AS TotalRevenueInUSD\nFROM\n flight_reservations.transactions t\nJOIN\n flight_reservations.reservations r\nON\n CAST(t.reservation_id AS STRING) = CAST(r.reservation_id AS STRING)\nWHERE\n DATE(t.transaction_datetime) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31') \n AND r.status = 'Confirmed';,TotalRevenueInUSD 0 3860.0,4.613995
3,"WITH MonthlyConfirmedTransactions AS (\n SELECT\n t.transaction_id,\n t.amount,\n t.transaction_datetime\n FROM\n flight_reservations.transactions t\n JOIN\n flight_reservations.reservations r\n ON\n CAST(t.reservation_id AS STRING) = CAST(r.reservation_id AS STRING)\n WHERE\n CAST(DATE(t.transaction_datetime) AS DATE) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31') \n AND r.status = 'Confirmed'\n)\n\nSELECT\n SUM(MonthlyConfirmedTransactions.amount) AS TotalRevenueInUSD\nFROM\n MonthlyConfirmedTransactions;",TotalRevenueInUSD 0 3860.0,4.904417
4,"-- Create a common table expression (CTE) named MonthlyConfirmedTransactions to optimize performance.\nWITH MonthlyConfirmedTransactions AS (\n SELECT\n transaction_id,\n amount,\n transaction_datetime\n FROM\n flight_reservations.transactions t\n JOIN\n flight_reservations.reservations r\n ON\n CAST(t.reservation_id AS STRING) = CAST(r.reservation_id AS STRING)\n WHERE\n DATE(transaction_datetime) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31') \n AND r.status = 'Confirmed'\n)\n\n-- Select the total revenue from confirmed transactions in October 2023.\nSELECT\n SUM(amount) AS TotalRevenueInUSD\nFROM\n MonthlyConfirmedTransactions;",TotalRevenueInUSD 0 3860.0,5.09682


In [None]:
# sql_output is ordered by ascending latency, so row 0 holds the result of the fastest successful query.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,TotalRevenueInUSD
0,3860.0


In [None]:
# Natural-language question that fills the QUESTION slot of seed_prompt (frequency by month).
question = "Determine departure months with the highest frequency for the year 2023."

In [None]:
# Fill the template: the first slot receives the question, the second the Markdown schema table.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
# When every attempt fails the helper returns {"error", "prompts", "errors"} with no
# "dataframe" key; report the collected errors instead of failing with a bare KeyError.
if 'dataframe' not in response:
    raise RuntimeError(f"{response['error']} Errors: {response['errors']}")
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
WITH MonthlyDepartureCounts AS (
  SELECT
    SUBSTRING(departure_datetime, 1, 7) AS departure_month,  -- Extract the year-month from the departure_datetime column
    COUNT(*) AS departure_count
  FROM
    flights
  WHERE
    SUBSTR(departure_datetime, 1, 4) = '2023'  -- Filter flights for the year 2023
  GROUP BY
    departure_month  -- Group the flights by year-month
)
SELECT
  departure_month,
  departure_count
FROM
  MonthlyDepartureCounts
ORDER BY
  departure_count DESC;  -- Order the results by departure count in descending order;
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
--WITH MonthlyDepartureCounts AS (
SELECT
  SUBSTR(f.departure_datetime, 1, 7) AS departure_month,  -- Extract the year-month from the departure_datetime column
  COUNT(*) AS departure_count
FROM
  `pradeep-genai.flight_reservations.flights` AS f  -- Fully qualify the 'flights'

Unnamed: 0,Query,Result,Latency
0,"SELECT\n DATE_TRUNC(f.departure_datetime, MONTH) AS departure_month, -- Extract the year-month from the departure_datetime column\n COUNT(*) AS departure_count\nFROM\n `pradeep-genai.flight_reservations.flights` AS f -- Fully qualify the 'flights' table with the project ID and dataset ID\nWHERE\n EXTRACT(YEAR FROM f.departure_datetime) = 2023 -- Filter flights for the year 2023\nGROUP BY\n departure_month -- Group the flights by year-month\nORDER BY\n departure_count DESC; -- Order the results by departure count in descending order;",departure_month departure_count 0 2023-11-01 8 1 2023-12-01 8,3.850192


In [None]:
# sql_output is ordered by ascending latency, so row 0 holds the result of the fastest successful query.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,departure_month,departure_count
0,2023-11-01,8
1,2023-12-01,8


In [None]:
# Natural-language question that fills the QUESTION slot of seed_prompt (bucketing and counting).
question = "Group customers into five distinct age brackets and count the number of customers in each bracket."

In [None]:
# Fill the template: the first slot receives the question, the second the Markdown schema table.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
# When every attempt fails the helper returns {"error", "prompts", "errors"} with no
# "dataframe" key; report the collected errors instead of failing with a bare KeyError.
if 'dataframe' not in response:
    raise RuntimeError(f"{response['error']} Errors: {response['errors']}")
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
WITH AgeGroupedCustomers AS (
  SELECT
    CASE
      WHEN DATE_DIFF(CURRENT_DATE(), date_of_birth, YEAR) < 20 THEN '0-20'
      WHEN DATE_DIFF(CURRENT_DATE(), date_of_birth, YEAR) < 40 THEN '21-39'
      WHEN DATE_DIFF(CURRENT_DATE(), date_of_birth, YEAR) < 60 THEN '40-59'
      WHEN DATE_DIFF(CURRENT_DATE(), date_of_birth, YEAR) < 80 THEN '60-79'
      ELSE '80+'
    END AS age_group,
    COUNT(DISTINCT customer_id) AS customer_count
  FROM
    customers
  GROUP BY
    age_group
)
SELECT
  age_group,
  customer_count
FROM
  AgeGroupedCustomers
ORDER BY
  age_group;
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
WITH AgeGroupedCustomers AS (
SELECT
  CASE
    WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 20 THEN '0-20'
    WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 40 THEN '21-39'
    WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, Y

Unnamed: 0,Query,Result,Latency
0,"WITH AgeGroupedCustomers AS (\nSELECT\n CASE\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 20 THEN '0-20'\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 40 THEN '21-39'\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 60 THEN '40-59'\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 80 THEN '60-79'\n ELSE '80+'\n END AS age_group,\n COUNT(DISTINCT c.customer_id) AS customer_count\nFROM\n flight_reservations.customers c\nGROUP BY\n age_group\n)\nSELECT\n age_group,\n customer_count\nFROM\n AgeGroupedCustomers\nORDER BY\n age_group;",age_group customer_count 0 21-39 9 1 40-59 7 2 60-79 3 3 80+ 1,4.404687
1,"SELECT\n CASE\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 20 THEN '0-20'\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 40 THEN '21-39'\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 60 THEN '40-59'\n WHEN DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) < 80 THEN '60-79'\n ELSE '80+'\n END AS age_group,\n COUNT(DISTINCT c.customer_id) AS customer_count\nFROM\n `flight_reservations.customers` c\nGROUP BY\n age_group\nORDER BY\n age_group;",age_group customer_count 0 21-39 9 1 40-59 7 2 60-79 3 3 80+ 1,4.641061


In [None]:
# sql_output is ordered by ascending latency, so row 0 holds the result of the fastest successful query.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,age_group,customer_count
0,21-39,9
1,40-59,7
2,60-79,3
3,80+,1


In [None]:
# Natural-language question that fills the QUESTION slot of seed_prompt (filter, rank and display ages).
question = "Identify and rank all the customers aged 18+ who have `Confirmed` reservations for the current month, ordered by their age. Make sure to display their ages in the result."

In [None]:
# Fill the template: the first slot receives the question, the second the Markdown schema table.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
# When every attempt fails the helper returns {"error", "prompts", "errors"} with no
# "dataframe" key; report the collected errors instead of failing with a bare KeyError.
if 'dataframe' not in response:
    raise RuntimeError(f"{response['error']} Errors: {response['errors']}")
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
-- This BigQuery SQL query identifies and ranks customers aged 18+ with confirmed reservations for the current month, ordered by age.

-- Common Table Expression (CTE) to calculate customer ages
WITH AgeCalculation AS (
    SELECT
        c.customer_id,
        DATE_DIFF("2023-08-31", c.date_of_birth, YEAR) AS age
    FROM
        customers c
)

-- Select and filter customers based on age and reservation details
SELECT
    ac.customer_id,
    ac.age,
    c.first_name,
    c.last_name,
    COUNT(t.reservation_id) AS reservation_count
FROM
    customers c
JOIN
    AgeCalculation ac ON c.customer_id = ac.customer_id
JOIN
    reservations r ON c.customer_id = r.customer_id
JOIN
    transactions t ON r.reservation_id = t.reservation_id
WHERE
    -- Converting datetime to DATE to ignore time component and then comparing date
    CAST(t.transaction_datetime AS DATE) BETWEEN DATE '2023-08-01' AND DATE '2023-08-31'
    AND r.status = '

Unnamed: 0,Query,Result,Latency
0,"-- This optimized BigQuery SQL query addresses the error associated with unqualified table references and provides precise information about customers aged 18+ with confirmed reservations for the current month.\n\n-- Common Table Expression (CTE) to calculate customer details\nWITH CustomerDetails AS (\n SELECT\n c.customer_id,\n c.first_name,\n c.last_name,\n DATE_DIFF(""2023-08-31"", c.date_of_birth, YEAR) AS age\n FROM\n flight_reservations.customers c\n),\n\n-- Subquery to aggregate reservation counts\nReservationCounts AS (\n SELECT\n r.customer_id,\n COUNT(1) AS reservation_count\n FROM\n flight_reservations.reservations r\n WHERE\n CAST(r.reservation_datetime AS DATE) BETWEEN DATE '2023-08-01' AND DATE '2023-08-31'\n AND r.status = 'Confirmed'\n GROUP BY\n r.customer_id\n)\n\n-- Main query to retrieve customer data with reservation counts\nSELECT\n cd.customer_id,\n cd.first_name,\n cd.last_name,\n cd.age,\n COALESCE(rc.reservation_count, 0) AS reservation_count\nFROM\n CustomerDetails cd\nLEFT JOIN\n ReservationCounts rc ON cd.customer_id = rc.customer_id\nWHERE\n cd.age >= 18\nORDER BY\n cd.age DESC;",customer_id first_name last_name age reservation_count 0 19 Queen Elizabeth 97 0 1 17 Olivia Newton 75 0 2 14 Liam Neeson 71 0 3 12 Jack Sparrow 60 0 4 16 Nick Fury 58 0 5 9 Gary Oldman 53 0 6 7 Ethan Hunt 48 0 7 13 Kate Winslet 48 0 8 20 Ryan Reynolds 47 0 9 11 Ian Somerhalder 45 0 10 4 Bob Smith 41 0 11 1 John Doe 38 0 12 2 Jane Doe 36 0 13 6 Diana Prince 35 0 14 3 Alice Johnson 33 0 15 8 Fiona Shrek 31 0 16 15 Mary Jane 30 0 17 5 Charlie Brown 28 0 18 10 Hannah Montana 25 0 19 18 Peter Parker 22 0,5.697976
1,"-- This optimized BigQuery SQL query addresses the error associated with unqualified table references and provides precise information about customers aged 18+ with confirmed reservations for the current month.\n\n-- Common Table Expression (CTE) to calculate customer details\nWITH CustomerDetails AS (\n SELECT\n c.customer_id,\n c.first_name,\n c.last_name,\n CAST(c.date_of_birth AS DATE) AS date_of_birth,\n DATE_DIFF(""2023-08-31"", c.date_of_birth, YEAR) AS age\n FROM\n flight_reservations.customers c\n),\n\n-- Subquery to aggregate reservation counts\nReservationCounts AS (\n SELECT\n r.customer_id,\n COUNT(1) AS reservation_count\n FROM\n flight_reservations.reservations r\n WHERE\n DATE(r.reservation_datetime) BETWEEN DATE '2023-08-01' AND DATE '2023-08-31' -- Use DATE() to extract the date part\n AND r.status = 'Confirmed'\n GROUP BY\n r.customer_id\n)\n\n-- Main query to retrieve customer data with reservation counts\nSELECT\n cd.customer_id,\n cd.first_name,\n cd.last_name,\n cd.age,\n COALESCE(rc.reservation_count, 0) AS reservation_count\nFROM\n CustomerDetails cd\nLEFT JOIN\n ReservationCounts rc ON cd.customer_id = rc.customer_id\nWHERE\n cd.date_of_birth BETWEEN DATE '1905-08-31' AND DATE '2005-08-31' -- Calculate the date range for customers aged 18+ as of August 2023\nORDER BY\n cd.age DESC;",customer_id first_name last_name age reservation_count 0 19 Queen Elizabeth 97 0 1 17 Olivia Newton 75 0 2 14 Liam Neeson 71 0 3 12 Jack Sparrow 60 0 4 16 Nick Fury 58 0 5 9 Gary Oldman 53 0 6 7 Ethan Hunt 48 0 7 13 Kate Winslet 48 0 8 20 Ryan Reynolds 47 0 9 11 Ian Somerhalder 45 0 10 4 Bob Smith 41 0 11 1 John Doe 38 0 12 2 Jane Doe 36 0 13 6 Diana Prince 35 0 14 3 Alice Johnson 33 0 15 8 Fiona Shrek 31 0 16 15 Mary Jane 30 0 17 5 Charlie Brown 28 0 18 10 Hannah Montana 25 0 19 18 Peter Parker 22 0,5.800199
2,"-- Optimized SQL query for retrieving customers aged 18+ with confirmed reservations for the current month, prioritizing latency.\n\n-- Common Table Expression (CTE) to pre-calculate customer details\nWITH CustomerDetails AS (\n SELECT\n customer_id,\n first_name,\n last_name,\n CAST(date_of_birth AS DATE) AS date_of_birth,\n DATE_DIFF(""2023-08-31"", date_of_birth, YEAR) AS age\n FROM\n flight_reservations.customers\n)\n\n-- Main query to retrieve customer details and reservation counts\nSELECT\n cd.customer_id,\n cd.first_name,\n cd.last_name,\n cd.age,\n COALESCE(rc.reservation_count, 0) AS reservation_count\nFROM\n CustomerDetails cd\nLEFT JOIN (\n -- Subquery to count reservations for the current month\n SELECT\n customer_id,\n COUNT(*) AS reservation_count\n FROM\n flight_reservations.reservations\n WHERE\n CAST(reservation_datetime AS DATE) BETWEEN DATE '2023-08-01' AND DATE '2023-08-31'\n AND status = 'Confirmed'\n GROUP BY\n customer_id\n) rc ON cd.customer_id = rc.customer_id\nWHERE\n cd.date_of_birth BETWEEN DATE '1905-08-31' AND DATE '2005-08-31' -- Calculate the date range for customers aged 18+ as of August 2023\nORDER BY\n cd.age DESC;",customer_id first_name last_name age reservation_count 0 19 Queen Elizabeth 97 0 1 17 Olivia Newton 75 0 2 14 Liam Neeson 71 0 3 12 Jack Sparrow 60 0 4 16 Nick Fury 58 0 5 9 Gary Oldman 53 0 6 7 Ethan Hunt 48 0 7 13 Kate Winslet 48 0 8 20 Ryan Reynolds 47 0 9 11 Ian Somerhalder 45 0 10 4 Bob Smith 41 0 11 1 John Doe 38 0 12 2 Jane Doe 36 0 13 6 Diana Prince 35 0 14 3 Alice Johnson 33 0 15 8 Fiona Shrek 31 0 16 15 Mary Jane 30 0 17 5 Charlie Brown 28 0 18 10 Hannah Montana 25 0 19 18 Peter Parker 22 0,5.845005


In [None]:
# sql_output is ordered by ascending latency, so row 0 holds the result of the fastest successful query.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,customer_id,first_name,last_name,age,reservation_count
0,19,Queen,Elizabeth,97,0
1,17,Olivia,Newton,75,0
2,14,Liam,Neeson,71,0
3,12,Jack,Sparrow,60,0
4,16,Nick,Fury,58,0
5,9,Gary,Oldman,53,0
6,7,Ethan,Hunt,48,0
7,13,Kate,Winslet,48,0
8,20,Ryan,Reynolds,47,0
9,11,Ian,Somerhalder,45,0
