In [None]:
# Colab-only step: sign in the current user so the Google Cloud clients
# created later in the notebook can use those credentials.
from google.colab import auth
auth.authenticate_user()

In [None]:
from vertexai.language_models import CodeGenerationModel
from google.cloud import bigquery
import vertexai
import pandas as pd
import time
import os

In [None]:
# Pandas display tweaks so long SQL strings and wide result frames stay readable.
pd.options.display.max_colwidth = None  # never truncate cell contents (full SQL text is shown)
pd.options.display.expand_frame_repr = False  # keep wide frames on one line instead of wrapping the text repr


In [None]:
# Notebook-wide configuration for the Google Cloud project and the code model.
PROJECT_ID = 'pradeep-genai'  # project used for the BigQuery client and vertexai.init
CODE_GEN_MODEL_NAME = 'code-bison'  # model loaded via CodeGenerationModel.from_pretrained
TEMPERATURE = 1  # passed to every predict() call in generate_and_execute_sql
MAX_OUTPUT_TOKENS = 2048  # maximum length of the model response; overrides the default value of 128
# TOP_P = 0.95  # default value
# TOP_K = 40  # default value
LOCATION = 'us-central1'  # region; presumably intended for vertexai.init(location=...)

In [None]:
# BigQuery dataset and the tables whose column metadata is pulled into the prompt schema.
DATASET = 'flight_reservations'
# NOTE(review): 'loyality_points' is presumably the table's actual (misspelled) name — confirm before "fixing" it.
TABLES = ['customers', 'flights', 'reservations', 'transactions', 'loyality_points']

In [None]:
# Create the shared clients used by every later cell.
bq_client = bigquery.Client(project=PROJECT_ID)
# Use the LOCATION constant from the config cell instead of repeating the region literal,
# so changing the region in one place actually takes effect.
vertexai.init(project=PROJECT_ID, location=LOCATION)
code_gen_model = CodeGenerationModel.from_pretrained(model_name=CODE_GEN_MODEL_NAME)

In [None]:
# Build the IN (...) list once: each configured table name wrapped in double quotes.
quoted_tables = ','.join(f'"{name}"' for name in TABLES)

# Column-level metadata for the configured tables, read from INFORMATION_SCHEMA.
query = f"""
    SELECT *
    FROM `{PROJECT_ID}.{DATASET}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ({quoted_tables})
"""
print(query)


    SELECT *
    FROM `pradeep-genai.flight_reservations.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ("customers","flights","reservations","transactions","loyality_points")



In [None]:
# Run the metadata query and materialise the result as a DataFrame.
schema_job = bq_client.query(query=query)
schema_columns = schema_job.to_dataframe()
schema_columns

Unnamed: 0,table_catalog,table_schema,table_name,column_name,field_path,data_type,description,collation_name,rounding_mode
0,pradeep-genai,flight_reservations,transactions,transaction_id,transaction_id,INT64,,,
1,pradeep-genai,flight_reservations,transactions,reservation_id,reservation_id,INT64,,,
2,pradeep-genai,flight_reservations,transactions,amount,amount,FLOAT64,,,
3,pradeep-genai,flight_reservations,transactions,transaction_datetime,transaction_datetime,DATETIME,,,
4,pradeep-genai,flight_reservations,reservations,reservation_id,reservation_id,INT64,,,
5,pradeep-genai,flight_reservations,reservations,customer_id,customer_id,INT64,,,
6,pradeep-genai,flight_reservations,reservations,flight_id,flight_id,INT64,,,
7,pradeep-genai,flight_reservations,reservations,reservation_datetime,reservation_datetime,DATETIME,,,
8,pradeep-genai,flight_reservations,reservations,status,status,STRING,,,
9,pradeep-genai,flight_reservations,flights,flight_id,flight_id,INT64,,,


In [None]:
# Render the schema as a Markdown table so it can be embedded in the prompt text.
# The name is rebound from DataFrame to str, so guard the conversion: without it,
# re-running this cell fails because a str has no .to_markdown().
if isinstance(schema_columns, pd.DataFrame):
    schema_columns = schema_columns.to_markdown(index=False)
print(schema_columns)

| table_catalog   | table_schema        | table_name   | column_name          | field_path           | data_type   | description   | collation_name   | rounding_mode   |
|:----------------|:--------------------|:-------------|:---------------------|:---------------------|:------------|:--------------|:-----------------|:----------------|
| pradeep-genai   | flight_reservations | transactions | transaction_id       | transaction_id       | INT64       |               | NULL             |                 |
| pradeep-genai   | flight_reservations | transactions | reservation_id       | reservation_id       | INT64       |               | NULL             |                 |
| pradeep-genai   | flight_reservations | transactions | amount               | amount               | FLOAT64     |               | NULL             |                 |
| pradeep-genai   | flight_reservations | transactions | transaction_datetime | transaction_datetime | DATETIME    |               | NULL             

In [None]:
def _strip_code_fences(text):
    """Return `text` without a surrounding Markdown code fence, if one is present."""
    lines = text.strip().split('\n')
    # Opening fence, e.g. "```sql" — only dropped when it really is a fence.
    if lines and lines[0].lstrip().startswith('```'):
        lines = lines[1:]
    # Closing fence.
    if lines and lines[-1].strip().startswith('```'):
        lines = lines[:-1]
    return '\n'.join(lines).strip()


def generate_and_execute_sql(prompt, max_tries=5, return_all=False):
    """
    Generate SQL with the code model, run it on BigQuery, and rank the successes by latency.

    Each attempt sends the current prompt to `code_gen_model`, removes any Markdown code
    fence from the reply, and executes the SQL with `bq_client`. When an attempt fails,
    the error message and the last generated SQL are appended to the prompt so the next
    attempt can try to correct it.

    Args:
    - prompt (str): Prompt to provide to the model for generating SQL.
    - max_tries (int): Maximum number of attempts to generate and execute SQL.
    - return_all (bool): Return every successful query (True) or only the fastest (False).

    Returns:
    - dict: one of
        * {"error", "prompts", "errors"} when no attempt succeeded;
        * {"dataframe"} when return_all is True: a DataFrame with columns
          Query, Result, Latency, fastest first;
        * {"fastest_query", "result", "latency"} otherwise.
      Latency is the wall-clock time in seconds spent submitting the query and
      fetching its result; model generation time is not included.
    """

    error_messages = []
    prompts = [prompt]
    successful_queries = []
    # Pre-bind so the failure prompt can still be built when the very first model
    # call raises before any SQL has been generated.
    generated_sql_query = ''

    for attempt in range(1, max_tries + 1):
        print(f'TRIAL: {attempt}')
        try:
            # Predict SQL using the model
            response = code_gen_model.predict(prompt, temperature=TEMPERATURE, max_output_tokens=MAX_OUTPUT_TOKENS)
            # Strip the Markdown fence only when present; blindly dropping the first and
            # last line would destroy an unfenced reply.
            generated_sql_query = _strip_code_fences(response.text)
            print('-' * 50)
            print(generated_sql_query)
            print('-' * 50)
            # Execute SQL using BigQuery client. The timer starts here so queries are
            # ranked by their own execution time, not by how long the model took.
            start_time = time.perf_counter()
            df = bq_client.query(generated_sql_query).to_dataframe()
            latency = time.perf_counter() - start_time
            successful_queries.append({
                "query": generated_sql_query,
                "dataframe": df,
                "latency": latency
            })
            print('SUCCEEDED')
            # Evolve the prompt for success path to optimize the last successful query for latency.
            # NOTE(review): this only triggers from the second success onward, so the same prompt
            # is re-sent once after the first success; the rewritten prompt also no longer
            # carries the original question/schema. Confirm both are intended.
            if len(successful_queries) > 1:
                prompt = f"""Modify the last successful SQL query by making changes to it and optimizing it for latency.
            ENSURE that the NEW QUERY is DIFFERENT from the previous one while prioritizing faster execution.
            The last successful query was:
            {successful_queries[-1]["query"]}"""
        except Exception as e:
            print('FAILED')
            # Catch the error, store the message, and try again
            msg = str(e)
            error_messages.append(msg)
            # Evolve the prompt by appending the error message and asking the model to correct it
            prompt = f"""{prompt}
Encountered an error: {msg}.
To address this, please generate an alternative SQL query response that avoids this specific error.
Follow the instructions mentioned above to remediate the error.

Modify the below SQL query to resolve the issue and ensure it is not a repetition of the previously generated query.
{generated_sql_query}

Ensure the revised SQL query aligns precisely with the requirements outlined in the initial question.
Additionally, please optimize the query for latency while maintaining correctness and efficiency."""
            prompts.append(prompt)
        print('=' * 100)

    # If no successful queries
    if not successful_queries:
        return {
            "error": "All attempts exhausted.",
            "prompts": prompts,
            "errors": error_messages
        }

    # Sort successful queries by latency (fastest first)
    successful_queries.sort(key=lambda x: x['latency'])

    if return_all:
        df = pd.DataFrame([(q["query"], q["dataframe"], q["latency"]) for q in successful_queries], columns=["Query", "Result", "Latency"])
        return {
            "dataframe": df
        }

    fastest = successful_queries[0]
    return {
        "fastest_query": fastest["query"],
        "result": fastest["dataframe"],
        "latency": fastest["latency"]
    }

In [None]:
# Prompt template with two positional `{}` slots, filled later with
# seed_prompt.format(question, schema_columns): QUESTION first, then SCHEMA.
# The explicit "\n" escapes add an extra blank line after the lines they end.
seed_prompt = """
Please craft a SQL query for BigQuery that addresses the following QUESTION provided below.
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below.
When joining tables, employ type coercion to guarantee data type consistency for the join columns.
Additionally, the output column names should specify units where applicable.\n
QUESTION:
{}\n
SCHEMA:
{}\n
IMPORTANT:
Use ONLY DATETIME and DO NOT use TIMESTAMP.
--
Ensure your SQL query accurately defines both the start and end of the DATETIME range.
"""
print(seed_prompt)


Please craft a SQL query for BigQuery that addresses the following QUESTION provided below. 
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below. 
When joining tables, employ type coercion to guarantee data type consistency for the join columns. 
Additionally, the output column names should specify units where applicable.

QUESTION:
{}

SCHEMA:
{}

IMPORTANT: 
Use ONLY DATETIME and DO NOT use TIMESTAMP.
--
Ensure your SQL query accurately defines both the start and end of the DATETIME range.



In [None]:
# Natural-language question that goes into the QUESTION slot of seed_prompt.
question = "Provide a list of all flight reservations from October 10th to October 15th, 2023"

In [None]:
# Fill the template's two positional slots: the question first, then the schema text.
prompt = seed_prompt.format(question, schema_columns)
print(prompt)


Please craft a SQL query for BigQuery that addresses the following QUESTION provided below. 
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below. 
When joining tables, employ type coercion to guarantee data type consistency for the join columns. 
Additionally, the output column names should specify units where applicable.

QUESTION:
Provide a list of all flight reservations from October 10th to October 15th, 2023

SCHEMA:
| table_catalog   | table_schema        | table_name   | column_name          | field_path           | data_type   | description   | collation_name   | rounding_mode   |
|:----------------|:--------------------|:-------------|:---------------------|:---------------------|:------------|:--------------|:-----------------|:----------------|
| pradeep-genai   | flight_reservations | transactions | transaction_id       | transaction_id       | INT64       |               | NULL             |                 |
| pradeep-genai 

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output


TRIAL: 1
--------------------------------------------------
WITH FilteredTransactions AS (
    SELECT
        t.reservation_id
    FROM
        transactions AS t
    JOIN
        reservations AS r ON t.reservation_id = r.reservation_id
    WHERE
        t.transaction_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59' AND
        r.status = 'Confirmed'
)
SELECT
    r.reservation_id,
    r.flight_id,
    r.customer_id,
    r.reservation_datetime,
    r.status,
    f.origin,
    f.destination,
    f.departure_datetime,
    f.arrival_datetime,
    f.carrier,
    f.price
FROM
    reservations AS r
JOIN
    flights AS f ON r.flight_id = f.flight_id
WHERE
    r.reservation_id IN (SELECT reservation_id FROM FilteredTransactions);
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
WITH FilteredTransactions AS (
    SELECT
        t.reservation_id
    FROM
        `flight_reservations.transactions` AS t
    JOIN
   

Unnamed: 0,Query,Result,Latency
0,"WITH FilteredTransactions AS (\n SELECT\n t.reservation_id\n FROM\n `flight_reservations.transactions` AS t\n JOIN\n `flight_reservations.reservations` AS r ON t.reservation_id = r.reservation_id\n WHERE\n CAST(t.transaction_datetime AS DATE) BETWEEN '2023-10-10' AND '2023-10-15' AND\n r.status = 'Confirmed'\n)\nSELECT\n r.reservation_id,\n r.flight_id,\n r.customer_id,\n r.reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime,\n f.arrival_datetime,\n f.carrier,\n f.price\nFROM\n `flight_reservations.reservations` AS r\nJOIN\n `flight_reservations.flights` AS f ON r.flight_id = f.flight_id\nWHERE\n r.reservation_id IN (SELECT reservation_id FROM FilteredTransactions);",reservation_id flight_id customer_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 6 6 6 2023-10-10 10:00:00 Confirmed SEA JFK 2023-11-25 06:00:00 2023-11-25 14:30:00 United 550.0 1 7 7 6 2023-10-12 11:30:00 Confirmed JFK MIA 2023-11-27 20:00:00 2023-11-27 23:30:00 American 380.0 2 8 8 8 2023-10-15 13:20:00 Confirmed MIA JFK 2023-11-30 10:00:00 2023-11-30 13:30:00 American 380.0,3.997611
1,"WITH FilteredTransactions AS (\n SELECT\n t.reservation_id\n FROM\n `flight_reservations.transactions` AS t\n JOIN\n `flight_reservations.reservations` AS r ON t.reservation_id = r.reservation_id\n WHERE\n CAST(t.transaction_datetime AS DATE) BETWEEN '2023-10-10' AND '2023-10-15' AND\n r.status = 'Confirmed'\n)\nSELECT\n r.reservation_id,\n r.flight_id,\n r.customer_id,\n r.reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime,\n f.arrival_datetime,\n f.carrier,\n f.price\nFROM\n `flight_reservations.reservations` AS r\nJOIN\n `flight_reservations.flights` AS f ON r.flight_id = f.flight_id\nWHERE\n r.reservation_id IN (SELECT reservation_id FROM FilteredTransactions);",reservation_id flight_id customer_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 6 6 6 2023-10-10 10:00:00 Confirmed SEA JFK 2023-11-25 06:00:00 2023-11-25 14:30:00 United 550.0 1 7 7 6 2023-10-12 11:30:00 Confirmed JFK MIA 2023-11-27 20:00:00 2023-11-27 23:30:00 American 380.0 2 8 8 8 2023-10-15 13:20:00 Confirmed MIA JFK 2023-11-30 10:00:00 2023-11-30 13:30:00 American 380.0,4.388936


In [None]:
# Row 0 holds the lowest-latency successful query: generate_and_execute_sql sorts
# by latency before building this frame. 'Result' is that query's result DataFrame.
result_df = sql_output.loc[0, 'Result']
result_df


Unnamed: 0,reservation_id,flight_id,customer_id,reservation_datetime,status,origin,destination,departure_datetime,arrival_datetime,carrier,price
0,6,6,6,2023-10-10 10:00:00,Confirmed,SEA,JFK,2023-11-25 06:00:00,2023-11-25 14:30:00,United,550.0
1,7,7,6,2023-10-12 11:30:00,Confirmed,JFK,MIA,2023-11-27 20:00:00,2023-11-27 23:30:00,American,380.0
2,8,8,8,2023-10-15 13:20:00,Confirmed,MIA,JFK,2023-11-30 10:00:00,2023-11-30 13:30:00,American,380.0


In [None]:
# Natural-language question that goes into the QUESTION slot of seed_prompt.
# NOTE(review): "last 7 days" is relative to the run date, so the result set changes between runs.
question = "Identify all customers who have made flight reservations within the last 7 days."

In [None]:
# Fill the template's two positional slots: the question first, then the schema text.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
SELECT DISTINCT
  c.customer_id,
  c.first_name,
  c.last_name,
  c.email
FROM
  Customers c
JOIN
  Reservations r ON c.customer_id = r.customer_id
WHERE
  r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) 
                             AND CURRENT_DATETIME();
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
-- Identify customers who have made flight reservations within the last 7 days

SELECT DISTINCT
  c.customer_id,
  c.first_name,
  c.last_name,
  c.email
FROM
  flight_reservations.customers c
JOIN
  flight_reservations.reservations r ON c.customer_id = r.customer_id
WHERE
  r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) 
                             AND CURRENT_DATETIME();
--------------------------------------------------
SUCCEEDED
TRIAL: 3
--------------------------------------------------
SELECT DI

Unnamed: 0,Query,Result,Latency
0,"-- Identify customers who have made flight reservations within the last 7 days\n\nSELECT DISTINCT\n c.customer_id,\n c.first_name,\n c.last_name,\n c.email\nFROM\n flight_reservations.customers c\nJOIN\n flight_reservations.reservations r ON c.customer_id = r.customer_id\nWHERE\n r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) \n AND CURRENT_DATETIME();","Empty DataFrame Columns: [customer_id, first_name, last_name, email] Index: []",3.995012
1,"SELECT DISTINCT \n c.customer_id,\n c.first_name,\n c.last_name,\n c.email\nFROM \n flight_reservations.customers AS c\nJOIN \n flight_reservations.reservations AS r ON c.customer_id = r.customer_id\nWHERE \n CAST(r.reservation_datetime AS DATE) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) \n AND CURRENT_DATE();","Empty DataFrame Columns: [customer_id, first_name, last_name, email] Index: []",4.247608
2,"SELECT DISTINCT \n c.customer_id,\n c.first_name,\n c.last_name,\n c.email\nFROM \n flight_reservations.customers AS c\nINNER JOIN \n flight_reservations.reservations AS r ON c.customer_id = r.customer_id\nWHERE \n CAST(r.reservation_datetime AS DATE) BETWEEN '2023-03-08' -- Replace with constant or stored procedure\n AND CURRENT_DATE()\nLIMIT 100; -- Specify the maximum number of rows to return",customer_id first_name last_name email 0 1 John Doe john.doe@example.com 1 2 Jane Doe jane.doe@example.com 2 3 Alice Johnson alice.j@example.com 3 6 Diana Prince diana.p@example.com 4 8 Fiona Shrek fiona.s@example.com 5 10 Hannah Montana hannah.m@example.com 6 11 Ian Somerhalder ian.s@example.com 7 13 Kate Winslet kate.w@example.com 8 15 Mary Jane mary.j@example.com 9 16 Nick Fury nick.f@example.com 10 17 Olivia Newton olivia.n@example.com 11 18 Peter Parker peter.p@example.com 12 19 Queen Elizabeth queen.e@example.com 13 20 Ryan Reynolds ryan.r@example.com,6.522063


In [None]:
# Row 0 holds the lowest-latency successful query: generate_and_execute_sql sorts
# by latency before building this frame. 'Result' is that query's result DataFrame.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,customer_id,first_name,last_name,email


In [None]:
# Natural-language question that goes into the QUESTION slot of seed_prompt.
question = "Calculate the total revenue generated from transactions in October 2023, specifically from all reservations with a Confirmed status."

In [None]:
# Fill the template's two positional slots: the question first, then the schema text.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
SELECT
  SUM(t.amount) AS total_revenue_usd
FROM
  transactions AS t
JOIN
  reservations AS r
ON
  CAST(t.reservation_id AS STRING) = CAST(r.reservation_id AS STRING)
WHERE
  r.status = 'Confirmed'
  AND DATE(t.transaction_datetime) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31');
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
SELECT SUM(t.amount) AS total_revenue_usd
FROM flight_reservations.transactions AS t
JOIN flight_reservations.reservations AS r
ON t.reservation_id = r.reservation_id
WHERE r.status = 'Confirmed'
  AND DATE(t.transaction_datetime) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31');
--------------------------------------------------
SUCCEEDED
TRIAL: 3
--------------------------------------------------
-- Calculate the total revenue generated from confirmed reservations in October 2023

SELECT
  SUM(t.amount) AS total_revenue_usd  -- C

Unnamed: 0,Query,Result,Latency
0,SELECT SUM(t.amount) AS total_revenue_usd\nFROM flight_reservations.transactions AS t\nJOIN flight_reservations.reservations AS r\nON t.reservation_id = r.reservation_id\nWHERE r.status = 'Confirmed'\n AND DATE(t.transaction_datetime) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31');,total_revenue_usd 0 3860.0,4.150079
1,"SELECT\n SUM(t.amount) AS total_revenue_usd -- Calculate the sum of transaction amounts\nFROM\n `pradeep-genai.flight_reservations.transactions` AS t -- Specify the fully qualified table name for ""transactions""\nJOIN\n `pradeep-genai.flight_reservations.reservations` AS r -- Specify the fully qualified table name for ""reservations""\nON\n CAST(t.reservation_id AS STRING) = CAST(r.reservation_id AS STRING) -- Join the two tables on the reservation ID\nWHERE\n r.status = 'Confirmed' -- Filter the reservations by the ""Confirmed"" status\n AND DATE(t.transaction_datetime) BETWEEN DATE('2023-10-01') AND DATE('2023-10-31'); -- Filter the transactions by date range (October 2023);",total_revenue_usd 0 3860.0,4.523736
2,SELECT\n SUM(t.amount) AS total_revenue_usd\nFROM\n `pradeep-genai.flight_reservations.transactions` AS t\nJOIN\n `pradeep-genai.flight_reservations.reservations` AS r\nON\n t.reservation_id = r.reservation_id\nWHERE\n r.status = 'Confirmed'\n AND t.transaction_datetime BETWEEN '2023-10-01' AND '2023-10-31';,total_revenue_usd 0 3860.0,6.40294


In [None]:
# Row 0 holds the lowest-latency successful query: generate_and_execute_sql sorts
# by latency before building this frame. 'Result' is that query's result DataFrame.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,total_revenue_usd
0,3860.0


In [None]:
# Natural-language question that goes into the QUESTION slot of seed_prompt.
question = "Determine the departure months with the highest frequency for the year 2023."

In [None]:
# Fill the template's two positional slots: the question first, then the schema text.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
-- Calculate the number of reservations made in each month of the year 2023
WITH MonthlyReservationCounts AS (
    SELECT
        -- Extract the month from the departure_datetime column
        EXTRACT(MONTH FROM departure_datetime) AS departure_month,
        -- Count the number of reservations for each month
        COUNT(*) AS reservation_count
    FROM
        -- Select relevant columns from the flights table
        flights
    WHERE
        -- Filter flights departing in the year 2023
        EXTRACT(YEAR FROM departure_datetime) = 2023
    GROUP BY
        -- Group results by the extracted departure month
        departure_month
)

-- Identify the month with the maximum reservation count
SELECT
    -- Select the departure month with the highest reservation count
    departure_month,
    -- Include the maximum reservation count for reference
    reservation_count
FROM
    -- Subquery to calculate monthly reservation coun

Unnamed: 0,Query,Result,Latency
0,"WITH MonthlyReservationCounts AS (\n SELECT\n -- Extract the month from the departure_datetime column\n EXTRACT(MONTH FROM f.departure_datetime) AS departure_month,\n -- Count the number of reservations for each month\n COUNT(*) AS reservation_count\n FROM\n `flight_reservations.flights` AS f\n WHERE\n -- Filter flights departing in the year 2023\n EXTRACT(YEAR FROM f.departure_datetime) = 2023\n GROUP BY\n -- Group results by the extracted departure month\n departure_month\n)\n\n-- Identify the month with the maximum reservation count\nSELECT\n -- Select the departure month with the highest reservation count\n departure_month,\n -- Include the maximum reservation count for reference\n reservation_count\nFROM\n -- Subquery to calculate monthly reservation counts\n MonthlyReservationCounts\nWHERE\n -- Filter for the month with the maximum reservation count\n reservation_count = (SELECT MAX(reservation_count) FROM MonthlyReservationCounts)\nORDER BY\n -- Sort the results by reservation count in descending order\n reservation_count DESC;",departure_month reservation_count 0 11 8 1 12 8,4.414487
1,"-- Calculate the number of reservations for each month in 2023\nWITH MonthlyReservationCounts AS (\n SELECT\n -- Extract the month from the departure_datetime column\n EXTRACT(MONTH FROM `flights`.`departure_datetime`) AS departure_month,\n -- Count the number of reservations for each month\n COUNT(*) AS reservation_count\n FROM\n `flight_reservations`.`flights`\n WHERE\n -- Filter flights departing in the year 2023\n EXTRACT(YEAR FROM `flights`.`departure_datetime`) = 2023\n GROUP BY\n -- Group results by the extracted departure month\n departure_month\n)\n\n-- Identify the month with the maximum reservation count\nSELECT\n -- Select the departure month with the highest reservation count\n departure_month,\n -- Include the maximum reservation count for reference\n reservation_count\nFROM\n -- Subquery to calculate monthly reservation counts\n MonthlyReservationCounts\nWHERE\n -- Filter for the month with the maximum reservation count\n reservation_count = (SELECT MAX(reservation_count) FROM MonthlyReservationCounts)\nORDER BY\n -- Sort the results by reservation count in descending order\n reservation_count DESC;",departure_month reservation_count 0 11 8 1 12 8,4.638003


In [None]:
# Row 0 holds the lowest-latency successful query: generate_and_execute_sql sorts
# by latency before building this frame. 'Result' is that query's result DataFrame.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,departure_month,reservation_count
0,11,8
1,12,8


In [None]:
# Natural-language question that goes into the QUESTION slot of seed_prompt.
# The bracket boundaries are not specified here, so they are left to the model.
question = "Group the customers into five distinct age brackets and count the number of customers in each bracket."

In [None]:
# Fill the template's two positional slots: the question first, then the schema text.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
-- Create a common table expression (CTE) called AgeBrackets
WITH AgeBrackets AS (
  
  -- Calculate the age of customers based on their date of birth
  SELECT
    c.customer_id,
    DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age 
  FROM
    customers c
)

-- Select and group customers into age brackets and count the number of customers in each bracket
SELECT
  CASE
    WHEN age < 20 THEN 'Under 20'
    WHEN age BETWEEN 20 AND 29 THEN '20-29'
    WHEN age BETWEEN 30 AND 39 THEN '30-39'
    WHEN age BETWEEN 40 AND 49 THEN '40-49'
    ELSE '50 and above'
  END AS age_bracket,
  COUNT(customer_id) AS customer_count
FROM
  AgeBrackets
GROUP BY
  age_bracket
ORDER BY
  age_bracket;
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
WITH AgeBrackets AS (
  SELECT
    c.customer_id,
    DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age
  FROM
    flight

Unnamed: 0,Query,Result,Latency
0,"WITH AgeBrackets AS (\n SELECT\n c.customer_id,\n DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age\n FROM\n flight_reservations.customers c\n)\nSELECT\n CASE\n WHEN age < 20 THEN 'Under 20'\n WHEN age BETWEEN 20 AND 29 THEN '20-29'\n WHEN age BETWEEN 30 AND 39 THEN '30-39'\n WHEN age BETWEEN 40 AND 49 THEN '40-49'\n ELSE '50 and above'\n END AS age_bracket,\n COUNT(customer_id) AS customer_count\nFROM\n AgeBrackets\nGROUP BY\n age_bracket\nORDER BY\n age_bracket;",age_bracket customer_count 0 20-29 3 1 30-39 6 2 40-49 5 3 50 and above 6,6.064784


In [None]:
# Row 0 holds the lowest-latency successful query: generate_and_execute_sql sorts
# by latency before building this frame. 'Result' is that query's result DataFrame.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,age_bracket,customer_count
0,20-29,3
1,30-39,6
2,40-49,5
3,50 and above,6


In [None]:
# Natural-language question that goes into the QUESTION slot of seed_prompt.
# NOTE(review): "current month" is relative to the run date, so the result set changes between runs.
question = "Identify and rank all the customers aged 18+ who have `Confirmed` reservations for the current month, ordered by their age. Make sure to display their ages in the result."

In [None]:
# Fill the template's two positional slots: the question first, then the schema text.
prompt = seed_prompt.format(question, schema_columns)

In [None]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
WITH CurrentMonthReservations AS (
    SELECT
        r.customer_id
    FROM
        flight_reservations.reservations r
    JOIN
        flight_reservations.transactions t ON r.reservation_id = t.reservation_id
    WHERE
        t.transaction_datetime BETWEEN DATE_TRUNC(CURRENT_DATE(), MONTH) AND DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH)
        AND r.status = 'Confirmed'
)

SELECT
    c.first_name,
    c.last_name,
    DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age,
    RANK() OVER (ORDER BY DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) DESC) AS age_rank
FROM
    CurrentMonthReservations cmr
JOIN
    flight_reservations.customers c ON cmr.customer_id = c.customer_id
WHERE
    DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) >= 18;
--------------------------------------------------
SUCCEEDED
TRIAL: 2
--------------------------------------------------
SELECT
  c."customer_id",
  c."first_name",
 

Unnamed: 0,Query,Result,Latency
0,"WITH CurrentMonthReservations AS (\n SELECT\n r.customer_id\n FROM\n flight_reservations.reservations r\n JOIN\n flight_reservations.transactions t ON r.reservation_id = t.reservation_id\n WHERE\n t.transaction_datetime BETWEEN DATE_TRUNC(CURRENT_DATE(), MONTH) AND DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH)\n AND r.status = 'Confirmed'\n)\n\nSELECT\n c.first_name,\n c.last_name,\n DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age,\n RANK() OVER (ORDER BY DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) DESC) AS age_rank\nFROM\n CurrentMonthReservations cmr\nJOIN\n flight_reservations.customers c ON cmr.customer_id = c.customer_id\nWHERE\n DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) >= 18;","Empty DataFrame Columns: [first_name, last_name, age, age_rank] Index: []",5.502213


In [None]:
# Row 0 holds the lowest-latency successful query: generate_and_execute_sql sorts
# by latency before building this frame. 'Result' is that query's result DataFrame.
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,first_name,last_name,age,age_rank
