# Install & import libraries

In [None]:
# !pip3 install --user -r base_requirements.txt

In [None]:
import re
import langchain
import pandas as pd
from tqdm import tqdm

from nl2sql import AskBQ, show

# Table Filtering Module

In [None]:
from nl2sql import table_filter
from table_filter_test import run_tests_generate_csv

### Single run

In [None]:
from nl2sql import table_filter

tables_list = table_filter("What were my sales for yesterday?")
tables_list

In [None]:
# run_tests_generate_csv()

# Model Configuration

In [None]:
# Build the AskBQ instance that the "Single Query Run" cell calls with a question.
# NOTE(review): project, dataset and bucket names are hard-coded here and repeated
# inside run_batch — presumably environment-specific; confirm before reuse.
ask = AskBQ(
    location = "us-central1",
    project_id = "poc-project-guleria",
    #dataset_id = "genai_poc",
    dataset_id = "final_genai_poc",
    # Single-run cell is pinned to one table; run_batch picks tables via table_filter instead
    table_names = ["authorizations_search"],
    enum_option_limit = 1000,
    result_row_limit = 1000,
    log_bucket="fiserve_logs",
    data_dict_loc= 'data_dictionary.json',
    postprocessors=['case_handler_transform'],
    # NOTE(review): presumably the strategies are tried in this order — confirm in nl2sql
    executor_chain = ['prompt_strategy', 'queryfix_strategy', 'agent_strategy'],
)

# Single Query Run

In [None]:
%%time
response = ask(
    "What were my sales for yesterday?"
)

In [None]:
print(response.latest_sql)

# Batch Queries Run

In [None]:
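## (question, ground-truth SQL) pairs for authorizations_search, settlement_search, funding_search and chargebacks_search
## NOTE(review): the 'Win to Loss ratio' ground truth repeats the deposit/fee percentage SQL — confirm it is the intended reference.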
questions = [
('What were my sales for yesterday?',
"""
SELECT
  SUM(a_amount) AS total_amount,
  COUNT(a_authCode) AS auth_code_count
FROM
  authorizations_search
WHERE
  DATE(a_transactionDateTime) = CURRENT_DATE() - INTERVAL 1 DAY;
"""
),
('What is my approval rate for last 2 weeks?',
"""
SELECT
  COUNTIF(a_approvalCode = 'Approved') / NULLIF(COUNT(a_authCode),0) as total_txn
FROM
  authorizations_search
WHERE
  a_transactionDateTime BETWEEN TIMESTAMP_SUB(CURRENT_DATE(), INTERVAL 14 DAY) AND CURRENT_DATE();
"""
),
('How many transactions went to Visa and Mastercard for the last month?',
"""
SELECT
  COUNT(a_authCode) as approved_txn
FROM
  authorizations_search
WHERE
  a_transactionDateTime BETWEEN TIMESTAMP_SUB(CURRENT_DATE(), INTERVAL 30 DAY) AND CURRENT_DATE()
AND a_network IN ('Visa', 'Mastercard');
"""
),
('List all online declined discover transactions for today.',
"""
SELECT
  COUNT(a_authCode) as approved_txn
FROM
  authorizations_search
WHERE
  a_transactionDateTime = CURRENT_DATE()  AND a_network = 'Discover' and a_approvalCode = 'Declined' and a_paymentMethod = 'Mobile & Ecommerce';
"""
),
    ('What settled past week?',
"""
SELECT
  COUNT(1) AS total_count,
  SUM(st_processedTransactionAmount) AS total_amount
FROM
  settlement_search
WHERE
  DATE(st_batchDate) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
  AND DATE(st_batchDate) < CURRENT_DATE();
"""
),
('What was the total sale amount for transactions settled via debit networks?',
"""
SELECT
  COUNT(1) AS total_count,
  SUM(st_processedTransactionAmount) AS total_amount
FROM
  settlement_search
WHERE
  DATE(st_batchDate) = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
  AND st_productCode = 'Debit Card';
"""
),
('Show me settlement summary by plan code for last week for swiped transactions?',
"""
SELECT
  st_planCode,
  COUNT(1) AS total_count,
  SUM(st_processedTransactionAmount) AS total_amount
FROM
  settlement_search
WHERE
  DATE(st_batchDate) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
  AND DATE(st_batchDate) < CURRENT_DATE()
GROUP BY
  st_planCode;
"""
),
('What are my total deposits broken down by deposit type last week?',
"""
SELECT
  fd_depositTypeCd,
  SUM(fd_netSalesAmt) AS TotalProcessedNetSales
FROM
  funding_search
WHERE
  DATE(fd_fundedDate) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
  AND fd_majorCategory = 'Deposit'
GROUP BY
  fd_depositTypeCd;
"""
),
('Which day within the last month was my highest funding?',
"""
SELECT
  fd_fundedDate
FROM
  funding_search
WHERE
  fd_fundedDate >= DATE_SUB(CURRENT_DATE, INTERVAL '1' MONTH)
GROUP BY
  fd_fundedDate
ORDER BY
  SUM(fd_netSalesAmt) DESC
LIMIT 1;
"""
),
('What is the Fees to Deposit ratio for yesterdays bank deposit?',
"""
SELECT
  (SUM(CASE WHEN fd_majorCategory = 'Deposit' THEN fd_netSalesAmt ELSE 0 END) / SUM(fd_netSalesAmt)) * 100 AS Deposit_Percentage,
  (SUM(CASE WHEN fd_majorCategory = 'Fee' THEN fd_netSalesAmt ELSE 0 END) / SUM(fd_netSalesAmt)) * 100 AS Fee_Percentage
FROM
  funding_search
WHERE
  DATE(fd_fundedDate) = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
  AND fd_majorCategory IN ('Deposit', 'Fee');
"""
),
('What is the Win to Loss ratio on disputes last month?',
"""
SELECT
  (SUM(CASE WHEN `Major Category` = 'Deposit' THEN `Processed Net Sales` ELSE 0 END) / SUM(`Processed Net Sales`)) * 100 AS `Deposit Percentage`,
  (SUM(CASE WHEN `Major Category` = 'Fee' THEN `Processed Net Sales` ELSE 0 END) / SUM(`Processed Net Sales`)) * 100 AS `Fee Percentage`
FROM
  chargebacks_search
WHERE
  `funded_date` = CURRENT_DATE - INTERVAL 1 DAY
  AND `Major Category` IN ('Deposit', 'Fee');
"""
),
('How many disputes over $100 do I have?',
"""
SELECT COUNT(1) AS count_of_records
FROM chargebacks_search
WHERE cb_statusDate >= DATE_SUB(CURRENT_DATE, INTERVAL '1' MONTH)
  AND cb_chargebackAmount > 100;
"""
),
('What are my top 5 stores from where I am getting the most disputes?',
"""
SELECT
  cb_siteID,
  COUNT(1) AS count_of_records
FROM
  chargebacks_search
WHERE cb_statusDate >= DATE_SUB(CURRENT_DATE, INTERVAL '1' MONTH)
  AND EXTRACT(YEAR FROM cb_statusDate) = EXTRACT(YEAR FROM CURRENT_DATE())
GROUP BY
  cb_siteID
ORDER BY
  count_of_records DESC
LIMIT 5;
"""
),
]

In [None]:
llm = langchain.llms.VertexAI(temperature=0, model_name="text-bison@latest", max_output_tokens=2048)

In [None]:
def auto_verify(nl_description, ground_truth, llm_amswer):
    """Ask the module-level `llm` to judge a generated SQL query against a reference.

    Args:
        nl_description: Natural-language question the SQL is meant to answer.
        ground_truth: Reference SQL query for that question.
        llm_amswer: Candidate SQL query to be judged.

    Returns:
        Whatever `llm(prompt)` returns; the prompt asks for "yes" or "no".
    """
    grading_prompt = f'''You are an expert at validating SQL queries. Given the Natrual language description
      and the SQL query corresponding to that description, please check if the students answer is correct.
      There can be different ways to achieve the same result by forming the query differently.
      If the students SQL query matches the ground truth and fits the NL description correctly, then return yes
      else return no.
      Natural language description: {nl_description}
      Ground truth: {ground_truth}
      students answer: {llm_amswer}
    '''
    verdict = llm(grading_prompt)
    return verdict


In [None]:
def run_batch(questions):
    """Generate SQL for every question and collect it next to its reference query.

    Args:
        questions: Iterable of (question, ground_truth_sql) pairs.

    Returns:
        A DataFrame with columns 'question', 'ground_truth', 'llm_response'
        and 'llm_rating', one row per input pair.
    """
    records = []
    for question_text, reference_sql in tqdm(questions):
        print(question_text)

        # Table selection is done per question, so a fresh AskBQ is built each time
        selected_tables = table_filter(question_text)
        print("Table name is: ", selected_tables)

        engine = AskBQ(
            location="us-central1",
            project_id="poc-project-guleria",
            dataset_id="final_genai_poc",
            table_names=selected_tables,
            enum_option_limit=1000,
            result_row_limit=1000,
            log_bucket="fiserve_logs",
            data_dict_loc='data_dictionary.json',
            postprocessors=['case_handler_transform'],
            executor_chain=['prompt_strategy', 'queryfix_strategy', 'agent_strategy'],
        )

        generated_sql = engine(question_text).latest_sql
        llm_verdict = auto_verify(question_text, reference_sql, generated_sql)
        records.append((question_text, reference_sql, generated_sql, llm_verdict))

    return pd.DataFrame(
        records,
        columns=['question', 'ground_truth', 'llm_response', 'llm_rating'],
    )

In [None]:
result_df = run_batch(questions)


In [None]:
result_df

In [None]:
# Inspect a single evaluated question: reference SQL first, generated SQL after the separator.
id_ = 12  # 0-based row position in result_df
print(result_df.iloc[id_].question)
print(result_df.iloc[id_].ground_truth)
print('-'*100)
print(result_df.iloc[id_].llm_response)

In [None]:
result_df.to_csv('results1.csv', index=False)


# Validation

In [None]:
# !pip install google-cloud-bigquery

import pandas as pd
from google.cloud import bigquery

# Project used for the BigQuery client and dataset name used to qualify tables in the validation queries.
PROJECT_ID = 'poc-project-guleria'
dataset_name = "final_genai_poc"

# Initialize a BigQuery client bound to PROJECT_ID; execute_query uses this module-level object
client = bigquery.Client(project = PROJECT_ID)

### Preprocessing [Add dataset name in queries]

In [None]:
def add_dataset_to_query(dataset_name, original_query):
    """Prefix each unqualified table that follows a FROM keyword with `dataset_name`.

    Every FROM clause is rewritten with its own table name, so sub-queries on
    different tables are handled independently. The FROM inside
    `EXTRACT(<part> FROM <expr>)` is left alone, as are table functions such as
    `UNNEST(...)`, parenthesised sub-queries and names that already contain a
    dot (already qualified). Matching is case-insensitive; a rewritten clause is
    emitted as `FROM <dataset>.<table>`.

    Args:
        dataset_name: Dataset to prepend to bare table names.
        original_query: SQL text to rewrite.

    Returns:
        The SQL text with bare table names qualified.

    Raises:
        ValueError: if `original_query` is not a string or has no FROM keyword.
    """
    if not isinstance(original_query, str) or not re.search(
        r'\bFROM\b', original_query, re.IGNORECASE
    ):
        raise ValueError("Invalid query format")

    # The first alternative consumes "EXTRACT(<part> FROM" (including forms like
    # WEEK(MONDAY)) so that its FROM is never mistaken for a table reference.
    # The table token stops at whitespace or a parenthesis, which keeps a
    # closing ")" of an enclosing sub-query out of the name.
    from_pattern = re.compile(
        r'(\bEXTRACT\s*\(\s*\w+(?:\s*\(\s*\w+\s*\))?\s+FROM\b)'
        r'|\bFROM\s+([^\s()]+)',
        re.IGNORECASE,
    )

    def _qualify(match):
        # EXTRACT(... FROM: return the text untouched
        if match.group(1) is not None:
            return match.group(0)

        table_name = match.group(2)
        is_function_call = match.string[match.end():match.end() + 1] == '('
        if is_function_call or '.' in table_name:
            return match.group(0)
        return f'FROM {dataset_name}.{table_name}'

    return from_pattern.sub(_qualify, original_query)

In [None]:
# Split the batch results into two parallel Python lists: reference SQL and generated SQL
ground_truth_queries,llm_response_queries  = result_df["ground_truth"],result_df["llm_response"]
ground_truth_queries = ground_truth_queries.to_list()
llm_response_queries = llm_response_queries.to_list()

In [None]:
# Apply the function to each query in the list
# Qualify table names with `dataset_name` in every query.
# add_dataset_to_query raises ValueError for a query without a FROM clause, which aborts this cell.
llm_response_new_queries = [add_dataset_to_query(dataset_name, query) for query in llm_response_queries]
ground_truth_new_queries = [add_dataset_to_query(dataset_name, query) for query in ground_truth_queries]

### Execute Query on BigQuery

In [None]:
def execute_query(query):
    """Submit `query` through the module-level BigQuery `client`.

    Args:
        query: SQL text to run.

    Returns:
        The value of the finished job's `to_dataframe()` call.
    """
    job = client.query(query)
    job.result()  # block until the job has completed
    frame = job.to_dataframe()
    return frame

In [None]:
# Compare only shape
def compare_shape(ground_truth_result, llm_response_result):
    """Return True when both results have the same `.shape`, else False."""
    expected_shape = ground_truth_result.shape
    actual_shape = llm_response_result.shape
    return expected_shape == actual_shape

In [None]:
# Compare shape and content
def compare_content(ground_truth_result, llm_response_result):
    """Return True when `ground_truth_result.equals(llm_response_result)` holds, else False."""
    is_identical = ground_truth_result.equals(llm_response_result)
    return True if is_identical else False

In [None]:
# Per-question verdicts, appended in the same order as the query lists
compare_shape_result=[]
compare_content_result=[]

# Walk the ground-truth and LLM queries pairwise; `index` is the 0-based position
# NOTE(review): there is no error handling here, so an exception from execute_query
# on any query stops the loop and no verdicts are recorded for the remaining questions.
for index,(ground_truth_query, llm_response_query) in enumerate(zip(ground_truth_new_queries, llm_response_new_queries)):
  # Show both queries, then run each one
  print("Ground Truth query: ", ground_truth_query)
  print("LLM Response query: ", llm_response_query)
  ground_truth_result = execute_query(ground_truth_query)
  llm_response_result = execute_query(llm_response_query)

  # Print both results under a 1-based question number
  print(f"Question: {index+1}")
  print("Ground Truth: ", ground_truth_result)
  print("LLM Response: ", llm_response_result)

  # Record the shape-only verdict and the compare_content verdict
  compare_shape_result.append(compare_shape(ground_truth_result, llm_response_result))
  compare_content_result.append(compare_content(ground_truth_result, llm_response_result))


# Wrap each verdict list in a one-column DataFrame so it can be joined to result_df
compare_shape_df = pd.DataFrame(compare_shape_result, columns=['shape_compare_rating'])
compare_content_df = pd.DataFrame(compare_content_result, columns=['content_compare_rating'])

In [None]:
# Put the LLM verdict and both comparison flags side by side, one row per question
# (a column-wise concat aligns rows on the index).
compare_results = pd.concat([result_df[["question", "ground_truth", "llm_response", "llm_rating"]], compare_shape_df, compare_content_df], axis=1)
compare_results

In [None]:
compare_results.to_csv('query_results.csv', index=False)

In [None]:
import gradio as gr

def gen_sql(input_text):
    """Placeholder handler: return the received text with a "Processed: " prefix."""
    # Process the SQL query or text input here
    return f"Processed: {input_text}"

# Predefined values for the dropdown
predefined_values = [
    "What were my sales for yesterday?",
    "What is my approval rate for last 2 weeks?",
    "How many transactions went to Visa and Mastercard for the last month?"
]

def _pick_input_text(selected_text, custom_text):
    """Return the typed text when there is any, otherwise the dropdown selection."""
    if custom_text and custom_text.strip():
        return custom_text
    return selected_text or ""

# gr.Interface needs a callable plus inputs and outputs, and Interface objects
# cannot be used as the inputs/outputs of another Interface, so the layout is
# assembled from components inside gr.Blocks instead.
with gr.Blocks() as interface:
    with gr.Row():
        # Dropdown for predefined values
        dropdown = gr.Dropdown(label="Select Text", choices=predefined_values)
        # Textbox for custom input; takes precedence over the dropdown when filled in
        custom_input = gr.Textbox(label="Or Enter Custom Text")

    # Output block: Textbox for displaying the output
    output_block = gr.Textbox(label="Output Text")

    run_button = gr.Button("Run")
    run_button.click(
        fn=lambda selected_text, custom_text: gen_sql(_pick_input_text(selected_text, custom_text)),
        inputs=[dropdown, custom_input],
        outputs=output_block,
    )

if __name__ == "__main__":
    # `show` is not a launch option, so only `share` is passed
    interface.launch(share=True)


In [None]:
import gradio as gr
from custom_prompt_chain import *

In [None]:
import gradio as gr
from custom_prompt_chain import *



# Demo UI with a fixed list of questions plus a separate free-text box.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    gr.Markdown("Select a question or type your own and then click **Run**")
    with gr.Row():
        question = gr.Dropdown(choices=["What were my sales for yesterday?", 
                                   "What is my approval rate for last 2 weeks?", 
                                   "How many disputes over $100 do I have?"], 
                          label="Select a question")
        custom_question = gr.Textbox(label="Or type your own question")  # New text input field
        sql_to_nl = gr.Checkbox(label="Select for natural language output")
        need_insights = gr.Checkbox(label="Select for insights")
        
    sql_out = gr.Textbox(label="SQL Query")
    results_out = gr.Textbox(label="Results")
    insights_out = gr.Textbox(label="Insights")
    
    btn = gr.Button("Run")
    # NOTE(review): four inputs are passed to gen_sql here, while the later demo cell
    # passes three (no custom_question) — confirm which arity gen_sql actually accepts.
    # gen_sql is presumably the one brought in by `from custom_prompt_chain import *`.
    btn.click(fn=gen_sql, inputs=[question, custom_question, sql_to_nl, need_insights] , outputs=[sql_out, results_out, insights_out])

# share=True requests a public link in addition to the local URL
demo.launch(share=True)

In [1]:
import pandas as pd
import gradio as gr
from custom_prompt_chain import *

In [2]:
# Load the test questions that populate the dropdown in the demo cell below.
test_file_name = "dataset/test_dataset_all_tables.csv"
test_df = pd.read_csv(test_file_name)
# NOTE(review): this rebinds `questions`, which earlier in the notebook held
# (question, SQL) tuples for run_batch; here it is a flat list from the `question` column.
questions = test_df.question.tolist()
# Remove two entries by position; the higher index goes first so that index 1
# still refers to its original position.
# NOTE(review): the reason these two rows are excluded is not shown — confirm.
del questions[10]
del questions[1]

In [None]:
# Demo UI: pick a question from the loaded test set (or type one into the
# dropdown), choose the output options, then click Run to call gen_sql.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    gr.Markdown("Select a question or type your own and then click **Run**")
    with gr.Row():
        # `scale` has to be an integer; the previous 2.5 triggered a warning, so it is rounded up to 3
        question = gr.Dropdown(choices=questions,
                          label="Select a question", allow_custom_value=True, scale=3)
        sql_to_nl = gr.Checkbox(label="Natural language output")
        need_insights = gr.Checkbox(label="Detailed Insights")

    btn = gr.Button("Run")

    sql_out = gr.Textbox(label="SQL Query")
    results_out = gr.Textbox(label="Results")
    insights_out = gr.Textbox(label="Insights")

    # NOTE(review): gen_sql is presumably the function exported by custom_prompt_chain;
    # it receives the three inputs in this order and must return three values.
    btn.click(fn=gen_sql, inputs=[question, sql_to_nl, need_insights], outputs=[sql_out, results_out, insights_out])

# share=True requests a public link; debug=True is passed through unchanged
demo.launch(share=True, debug=True)

  question = gr.Dropdown(choices=questions,


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://dedf1d4e93c9c4aef9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
