In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell runs (no kernel restart needed).
%load_ext autoreload
%autoreload 2

import sys  
# Add the parent directory to the module search path (at position 1) so that the
# project package imported below (nl2sql_src) can be found from this notebook.
sys.path.insert(1, '../')

In [18]:
import getpass
import json
import os

import pandas as pd
import vertexai
from pandas import DataFrame
from vertexai.language_models import TextGenerationModel

from nl2sql_src.nl2sql_generic import Nl2sqlBq_rag

In [19]:
# BigQuery project and dataset that the generated SQL is written against.
project_id, dataset_id = "cdii-poc", "HHS_Program_Counts"

In [20]:
# Short list of sample questions for the single-question demos.
# Disabled entry: "How many Black individuals are served across CalHHS programs?"
questions = [
    "How many people are enrolled in CalFresh?",
    "Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?",
    "What county has the greatest enrollment in WIC per capita?",
    "Which counties have the highest and lowest ratios of providers to enrolled participants in Medi-Cal?",
]

In [21]:
# Select the question used by the single-question demo cells
# (index 1 is the CalFresh co-enrollment question) and display it.
question = questions[1]
question
# Alternative: set the question text directly instead of indexing the list.
# question = 'Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program? '

'Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?'

### Generate SQL with zero-shot prompting

In [22]:
# Build the NL2SQL client from a metadata cache that has already been created.
# Alternative cache file: "../cache_metadata/updated-metadata.json"
meta_data_json_path = "../cache_metadata/metadata_cache.json"

# Alternative model name: "code-bison"
nl2sqlbq_client = Nl2sqlBq_rag(
    project_id=project_id,
    dataset_id=dataset_id,
    metadata_json_path=meta_data_json_path,
    model_name="text-bison",
)


In [23]:
# Zero-shot generation for the selected question.
print(question)

# table_filter prints the table(s) it selects; the return value is kept for inspection.
# NOTE(review): the recorded output shows the question and the "Table Filter" line
# twice, which suggests text_to_sql performs table filtering internally as well —
# confirm, and drop this explicit call if it is redundant.
table_info = nl2sqlbq_client.table_filter(question)

sql_query = nl2sqlbq_client.text_to_sql(question)
print(sql_query)

Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
 
SELECT 
  county,
  (CAST(_1_program AS FLOAT) / CAST(CalFresh AS FLOAT)) AS prop_coenrolled
FROM cdii-poc.HHS_Program_Counts.`calhhs-dashboard-2015-2020-annual-data-file`
WHERE 
  CalFresh > 0
ORDER BY 
  prop_coenrolled DESC
LIMIT 1;



## Using PostgreSQL and vector-DB indices to find the closest matching queries for few-shot prompting

### Initialize the PostgreSQL database
Please make sure the PostgreSQL instance and database have been created before running the next cell.

In [24]:
# PostgreSQL connection settings passed to the client's init_pgdb().
# Table name is 'documents' (per the original note; presumably the table that
# holds the few-shot examples — confirm against init_pgdb).
PGPROJ = "cdii-poc"
PGLOCATION = 'us-central1'
PGINSTANCE = "cdii-demo-temp"
PGDB = "demodbcdii"
PGUSER = "postgres"

# Never store the database password in the notebook: it ends up in version
# control and in shared copies. Read it from the PGPWD environment variable and
# fall back to an interactive prompt when the variable is unset or empty.
PGPWD = os.environ.get("PGPWD") or getpass.getpass(
    f"Password for PostgreSQL user {PGUSER!r}: "
)

nl2sqlbq_client.init_pgdb(PGPROJ, PGLOCATION, PGINSTANCE, PGDB, PGUSER, PGPWD)




### SQL generation with few-shot prompting

In [25]:
# Few-shot generation for the selected question.
print(question)

# table_filter prints the table(s) it selects; the return value is kept for inspection.
# NOTE(review): the recorded output shows the question and the "Table Filter" line
# twice, which suggests text_to_sql_fewshot performs table filtering internally as
# well — confirm, and drop this explicit call if it is redundant.
table_info = nl2sqlbq_client.table_filter(question)

sql_query = nl2sqlbq_client.text_to_sql_fewshot(question)
print(sql_query)

Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT Number AS county, 
       COALESCE(SUM(CAST(Person AS INT64)), 0) AS total_calfresh_recipients,
       COALESCE(SUM(CAST(_2_programs AS INT64)), 0) AS num_calfresh_recipients_co_enrolled_in_at_least_one_other_program,
       COALESCE(SUM(CAST(_2_programs AS INT64)) / SUM(CAST(Person AS INT64)), 0) AS prop_calfresh_recipients_co_enrolled_in_at_least_one_other_program
FROM `cdii-poc.HHS_Program_Counts.calhhs-dashboard-2015-2020-annual-data-file`
WHERE Level = 'County'
  AND (Number <> 'County Total' OR Number IS NULL)
  AND Program = 'CalFresh'
GROUP BY county
ORDER BY prop_calf

In [26]:
# Disabled cell: runs the zero-shot flow for questions[1] with a separate
# `Nl2sqlBq` client instead of the `Nl2sqlBq_rag` client used elsewhere.
# NOTE(review): `Nl2sqlBq` is not imported in this notebook's import cell; it must be
# imported before this cell can be re-enabled.
# nl2sqlbq_client_raw = Nl2sqlBq(project_id=project_id,
#                            dataset_id=dataset_id,
#                            metadata_json_path = "metadata_cache.json",
#                            model_name="text-bison"
#                           )
# question = questions[1]
# print(question)
# table_info = nl2sqlbq_client_raw.table_filter(question)
# table_info

# sql_query = nl2sqlbq_client_raw.text_to_sql(question)
# print(sql_query)

In [27]:
# Full evaluation set. Several entries are follow-up questions ("How many of
# them ...", "What about ...") whose meaning depends on the preceding question.
all_questions = [
    # CalFresh enrollment and trends
    "How many people are enrolled in CalFresh?",
    "How many of them live in Los Angeles County?",
    "How has participation in CalFresh changed since 2015?",
    "How do CalFresh program participation trends differ by race and ethnicity?",
    "How have these race and ethnicity trends changed over time?",
    # CalFresh co-enrollment
    "Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?",
    "What about three or more additional programs?",
    "Which programs have the highest co-enrollment with CalFresh?",
    # WIC
    "What county has the greatest enrollment in WIC per capita?",
    "Which five counties have the lowest number of WIC authorized vendors compared to WIC participants?",
    "How do infant mortality rates, low birthweight rates, and preterm and very preterm rates compare to WIC enrollment rates by county?",
    # Race breakdown across CalHHS programs
    "How many Black individuals are served across CalHHS programs?",
    "What is the breakdown by program?",
    "Has this changed over time?",
    "Change over time by program?",
    # Medi-Cal providers
    "Which counties have the highest and lowest ratios of providers to enrolled participants in Medi-Cal?",
    "What is the ratio of non-suspended doctors to Medi-Cal members by County?",
    "What about the ratio to licensed facilities?",
]

In [28]:
# Generate few-shot SQL for every question in all_questions.
#   results          - one {'question', 'generated_sql'} record per question; the SQL
#                      stays "SQL not generated" when generation raises.
#   failed_questions - the questions for which generation raised an exception.
print(len(all_questions))
results = []
failed_questions = []
for question in all_questions:
    print(question)
    record = {'question': question, 'generated_sql': "SQL not generated"}
    try:
        # table_filter prints the table(s) it selects; the return value is kept
        # only for inspection after the loop.
        table_info = nl2sqlbq_client.table_filter(question)
        sql_query = nl2sqlbq_client.text_to_sql_fewshot(question)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still stop the run, and report why the
        # question failed instead of silently discarding the error.
        print(f"SQL generation failed for {question!r}: {type(exc).__name__}: {exc}")
        failed_questions.append(question)
    else:
        print(sql_query)
        record['generated_sql'] = sql_query

    results.append(record)

18
How many people are enrolled in CalFresh?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
How many people are enrolled in CalFresh?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT 
  fileyear,
  county,
  CalFresh
FROM cdii-poc.HHS_Program_Counts.`calhhs-dashboard-2015-2020-annual-data-file`
WHERE 
  Program = 'CalFresh';

How many of them live in Los Angeles County?
Table Filter -  ['gdhcs-imdopen-datamedmed-calfreshmed-medi-cal_by-cnty_calfresh-dec-19']
How many of them live in Los Angeles County?
Table Filter -  ['gdhcs-imdopen-datamedmed-calfreshmed-medi-cal_by-cnty_calfresh-dec-19']
Table name  gdhcs-imdopen-datamedmed-calfreshmed-medi-cal_by-cnty_calfresh-dec-19
 
SELECT 
  County,
  SUM(Number_of_Beneficiaries) AS total_beneficiaries
FROM cdii-poc.HHS_Program_Counts.`gdhcs-imdopen-datamedmed-calfreshmed-medi-cal_by-cnty_calfresh-dec-19`
WHERE 
  County = 'Los Angeles'
GROUP BY 
  

Traceback (most recent call last):
  File "/home/jupyter/nl2sql-fiserv/temp/lib-nl2sql/nl_2_sql_lib/final_lib/notebooks/../nl2sql_src/nl2sql_generic.py", line 322, in text_to_sql_fewshot
    table_json = self.metadata_json[table_name]
KeyError: 'low-and-very-low-birthweight-by-race-ethnicity-2014-2018\npreterm-and-very-preterm-births-by-raceethnicity-2010-2018'


Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Which county has the greatest proportion of CalFresh recipients co-enrolled in at least one additional program?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT Number AS county, 
       COALESCE(SUM(CAST(Person AS INT64)), 0) AS total_calfresh_recipients,
       COALESCE(SUM(CAST(_2_programs AS INT64)), 0) AS num_calfresh_recipients_co_enrolled_in_at_least_one_other_program,
       COALESCE(SUM(CAST(_2_programs AS INT64)) / SUM(CAST(Person AS INT64)), 0) AS prop_calfresh_recipients_co_enrolled_in_at_least_one_other_program
FROM `cdii-poc.HHS_Program_Counts.calhhs-dashboard-2015-2020-annual-data-file`
WHERE Level = 'County'
  AND (Number <> 'County Total' OR Number IS NULL)
  AND Program = 'CalFresh'
GROUP BY county
ORDER BY prop_calfresh_recipients_co_enrolled_in_at_least_one_other_program DESC
LIMIT 1;

What about three or more additional pro

Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..


Table Filter -  ['abcd-350-annual-recipient-report-county-data']
Table name  abcd-350-annual-recipient-report-county-data


Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 8.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 10.0 seconds as it raised InvalidArgument: 400 The request cannot be proce

Which programs have the highest co-enrollment with CalFresh?
Table Filter -  ['abcd-350-annual-recipient-report-county-data']
Which programs have the highest co-enrollment with CalFresh?


Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..


Table Filter -  ['abcd-350-annual-recipient-report-county-data']
Table name  abcd-350-annual-recipient-report-county-data


Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 8.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 10.0 seconds as it raised InvalidArgument: 400 The request cannot be proce

What county has the greatest enrollment in WIC per capita?
Table Filter -  ['wic-redemption-by-county-by-participant-category-data-2010-2018']
What county has the greatest enrollment in WIC per capita?
Table Filter -  ['wic-redemption-by-county-by-participant-category-data-2010-2018']
Table name  wic-redemption-by-county-by-participant-category-data-2010-2018
 
SELECT
  Vendor_Location,
  COALESCE(
    SUM(CAST(_Number_of_Participants_Redeemed_ AS INT64)),
    0
  ) AS total_participants
FROM `cdii-poc.HHS_Program_Counts.wic-redemption-by-county-by-participant-category-data-2010-2018`
GROUP BY
  Vendor_Location
ORDER BY
  total_participants DESC
LIMIT 1;

Which five counties have the lowest number of WIC authorized vendors compared to WIC participants?
Table Filter -  ['women-infants-and-children-wic-authorized-vendors']
Which five counties have the lowest number of WIC authorized vendors compared to WIC participants?
Table Filter -  ['women-infants-and-children-wic-authorized-vendors'

Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..


Table Filter -  ['abcd-350-annual-recipient-report-county-data']
Table name  abcd-350-annual-recipient-report-county-data


Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 8.0 seconds as it raised InvalidArgument: 400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit..
Retrying langchain_google_vertexai.llms._completion_with_retry.<locals>._completion_with_retry_inner in 10.0 seconds as it raised InvalidArgument: 400 The request cannot be proce

What is the breakdown by program?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
What is the breakdown by program?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT Program, 
       COALESCE(SUM(CAST(Number as BIGNUMERIC)), 0) AS total_enrolled
FROM `cdii-poc.HHS_Program_Counts.calhhs-dashboard-2015-2020-annual-data-file`
GROUP BY Program
ORDER BY total_enrolled DESC;

Has this changed over time?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Has this changed over time?
Table Filter -  ['calhhs-dashboard-2015-2020-annual-data-file']
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT
  fileyear,
  COALESCE(SUM(SAFE_CAST(White AS INT64)), 0) AS total_white,
  COALESCE(SUM(SAFE_CAST(Black AS INT64)), 0) AS total_black,
  COALESCE(SUM(SAFE_CAST(Hispanic AS INT64)), 0) AS total_hispanic,
  COALESCE(SUM(SAFE_CAST(Asian_PI AS INT64)), 0) AS total_asian,
  COALESCE(SUM

Traceback (most recent call last):
  File "/home/jupyter/nl2sql-fiserv/temp/lib-nl2sql/nl_2_sql_lib/final_lib/notebooks/../nl2sql_src/nl2sql_generic.py", line 322, in text_to_sql_fewshot
    table_json = self.metadata_json[table_name]
KeyError: 'The question asks for counties with the highest and lowest ratios of providers to enrolled participants in Medi-Cal. The table that is most likely to contain the relevant data is calhhs_dashboard-2015-2020-annual-data-file.'


Table Filter -  ['calhhs_medi_cal_ffs_provider_listing_2_21_24']
What is the ratio of non-suspended doctors to Medi-Cal members by County?
Table Filter -  ['calhhs_medi_cal_ffs_provider_listing_2_21_24']
Table name  calhhs_medi_cal_ffs_provider_listing_2_21_24
 
SELECT
  c.CountyName,
  COUNT(DISTINCT CASE WHEN p.Provider_License IS NOT NULL THEN p.Provider_Number END) AS non_suspended_doctors,
  SUM(SAFE_CAST(a.Person AS INT64)) AS medi_cal_members,
  (
    COUNT(DISTINCT CASE WHEN p.Provider_License IS NOT NULL THEN p.Provider_Number END) /
    SUM(SAFE_CAST(a.Person AS INT64))
  ) AS ratio_non_suspended_doctors_to_medi_cal_members
FROM cdii-poc.HHS_Program_Counts.`calhhs_medi_cal_ffs_provider_listing_2_21_24` p
JOIN
  calhhs_dashboard_2015_2020_annual_data_file a ON p.CountyName = a.CountyName
JOIN
  County c ON p.CountyName = c.CountyName
WHERE
  a.Program = 'Medi-Cal' AND
  a.level = 'County'
GROUP BY
  c.CountyName
ORDER BY
  ratio_non_suspended_doctors_to_medi_cal_members DESC;


In [29]:
# Disabled cell: generate SQL for one specific question (index 15 of
# all_questions) and append the record to `results`.
# NOTE(review): re-enabling this after the batch loop adds a second record for the
# same question to `results` — confirm that is intended.
# tmp = {}
# question = all_questions[15]
# print(question)
# table_info = nl2sqlbq_client.table_filter(question)
# table_info

# sql_query = nl2sqlbq_client.text_to_sql_fewshot(question)
# print(sql_query)
# tmp['question'] = question
# tmp['generated_sql'] = sql_query
    
# results.append(tmp)

In [30]:
# Display every question / generated-SQL record collected by the batch loop.
results

[{'question': 'How many people are enrolled in CalFresh?',
  'generated_sql': " \nSELECT \n  fileyear,\n  county,\n  CalFresh\nFROM cdii-poc.HHS_Program_Counts.`calhhs-dashboard-2015-2020-annual-data-file`\nWHERE \n  Program = 'CalFresh';\n"},
 {'question': 'How many of them live in Los Angeles County?',
  'generated_sql': " \nSELECT \n  County,\n  SUM(Number_of_Beneficiaries) AS total_beneficiaries\nFROM cdii-poc.HHS_Program_Counts.`gdhcs-imdopen-datamedmed-calfreshmed-medi-cal_by-cnty_calfresh-dec-19`\nWHERE \n  County = 'Los Angeles'\nGROUP BY \n  County;\n"},
 {'question': 'How has participation in CalFresh changed since 2015?',
  'generated_sql': ' \nSELECT\n  Year,\n  SUM(CalFresh_Enrollment) AS CalFresh_Enrollment\nFROM\n  medi-cal-and-calfresh-enrollment\nWHERE\n  Year >= 2015\nGROUP BY\n  Year\nORDER BY\n  Year;\n'},
 {'question': 'How do CalFresh program participation trends differ by race and ethnicity?',
  'generated_sql': " \nSELECT \n  fileyear,\n  COALESCE(SUM(SAFE_CAST(

In [31]:
# Sanity check: the batch loop appends one record per question, so this should
# equal len(all_questions).
len(results)

18

In [32]:
# Tabulate the collected records and write them to output.csv in the working
# directory, without the index column.
columns = ["question", "generated_sql"]
df = pd.DataFrame(data=results, columns=columns)
df.to_csv(path_or_buf="output.csv", index=False)

In [33]:
# Display the question / generated-SQL table that was written to output.csv.
df

Unnamed: 0,question,generated_sql
0,How many people are enrolled in CalFresh?,"\nSELECT \n fileyear,\n county,\n CalFresh..."
1,How many of them live in Los Angeles County?,"\nSELECT \n County,\n SUM(Number_of_Benefic..."
2,How has participation in CalFresh changed sinc...,"\nSELECT\n Year,\n SUM(CalFresh_Enrollment)..."
3,How do CalFresh program participation trends d...,"\nSELECT \n fileyear,\n COALESCE(SUM(SAFE_C..."
4,How have these race and ethnicity trends chang...,SQL not generated
5,Which county has the greatest proportion of Ca...,"\nSELECT Number AS county, \n COALESCE(..."
6,What about three or more additional programs?,SQL not generated
7,Which programs have the highest co-enrollment ...,SQL not generated
8,What county has the greatest enrollment in WIC...,"\nSELECT\n Vendor_Location,\n COALESCE(\n ..."
9,Which five counties have the lowest number of ...,"\nSELECT Vendor_Location,\n (vendor_cnt..."


In [34]:
# Questions to regenerate with a hand-picked table, as (question, table name) pairs.
fq1, table_name1 = (
    "How have these race and ethnicity trends changed over time?",
    "calhhs-dashboard-2015-2020-annual-data-file",
)

fq2, table_name2 = (
    "Which five counties have the lowest number of WIC authorized vendors compared to WIC participants?",
    "wic-redemption-by-county-by-participant-category-data-2010-2018",
)

fq3, table_name3 = (
    "What is the breakdown by program?",
    "calhhs-dashboard-2015-2020-annual-data-file",
)

fq4, table_name4 = (
    "Has this changed over time?",
    "calhhs-dashboard-2015-2020-annual-data-file",
)

fq5, table_name5 = (
    "What about the ratio to licensed facilities?",
    "calhhs-dashboard-2015-2020-annual-data-file",
)

In [35]:
# Regenerate SQL for fq1, supplying the table name explicitly.
regen_question, regen_table = fq1, table_name1

sql_query = nl2sqlbq_client.text_to_sql_fewshot(
    regen_question,
    table_name=regen_table,
)
print(sql_query)

How have these race and ethnicity trends changed over time?
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT
  fileyear,
  SUM(White) AS total_white,
  SUM(Black) AS total_black,
  SUM(Hispanic) AS total_hispanic,
  SUM(Asian_PI) AS total_asian,
  SUM(Native_American) AS total_native_american
FROM `cdii-poc.HHS_Program_Counts.calhhs-dashboard-2015-2020-annual-data-file`
GROUP BY
  fileyear
ORDER BY
  fileyear;



In [36]:
# Regenerate SQL for fq2, supplying the table name explicitly.
regen_question, regen_table = fq2, table_name2

sql_query = nl2sqlbq_client.text_to_sql_fewshot(
    regen_question,
    table_name=regen_table,
)
print(sql_query)

Which five counties have the lowest number of WIC authorized vendors compared to WIC participants?
Table name  wic-redemption-by-county-by-participant-category-data-2010-2018
 
SELECT Vendor_Location,
       (vendor_cnt/total_participants)*100 AS vendor_participants_ratio
FROM (
    SELECT TRIM(Vendor_Location) AS Vendor_Location,
           COALESCE(SUM(CAST(_Number_of_Participants_Redeemed_ AS INT64)), 0) AS total_participants
    FROM `cdii-poc.HHS_Program_Counts.wic-redemption-by-county-by-participant-category-data-2010-2018``
    GROUP BY Vendor_Location) AS participants
JOIN (
    SELECT TRIM(COUNTY) AS COUNTY,
           COUNT(VENDOR) AS vendor_cnt
    FROM `cdii-poc.HHS_Program_Counts.women-infants-and-children-wic-authorized-vendors
    GROUP BY COUNTY
    HAVING COUNTY IS NOT NULL) AS vendors
ON UPPER(participants.Vendor_Location) = UPPER(vendors.COUNTY)
WHERE (vendor_cnt/total_participants)*100 IS NOT NULL
ORDER BY vendor_participants_ratio ASC
LIMIT 5;



In [37]:
# Regenerate SQL for fq3, supplying the table name explicitly.
regen_question, regen_table = fq3, table_name3

sql_query = nl2sqlbq_client.text_to_sql_fewshot(
    regen_question,
    table_name=regen_table,
)
print(sql_query)

What is the breakdown by program?
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT Program, 
       COALESCE(SUM(CAST(Number as BIGNUMERIC)), 0) AS total_enrolled
FROM `cdii-poc.HHS_Program_Counts.calhhs-dashboard-2015-2020-annual-data-file`
GROUP BY Program
ORDER BY total_enrolled DESC;



In [38]:
# Regenerate SQL for fq4, supplying the table name explicitly.
regen_question, regen_table = fq4, table_name4

sql_query = nl2sqlbq_client.text_to_sql_fewshot(
    regen_question,
    table_name=regen_table,
)
print(sql_query)

Has this changed over time?
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT
  fileyear,
  COALESCE(SUM(SAFE_CAST(White AS INT64)), 0) AS total_white,
  COALESCE(SUM(SAFE_CAST(Black AS INT64)), 0) AS total_black,
  COALESCE(SUM(SAFE_CAST(Hispanic AS INT64)), 0) AS total_hispanic,
  COALESCE(SUM(SAFE_CAST(Asian_PI AS INT64)), 0) AS total_asian,
  COALESCE(SUM(SAFE_CAST(Native_American AS INT64)), 0) AS total_native_american
FROM `cdii-poc.HHS_Program_Counts.calhhs-dashboard-2015-2020-annual-data-file`
GROUP BY
  fileyear
ORDER BY
  fileyear;



In [39]:
# Regenerate SQL for fq5, supplying the table name explicitly.
regen_question, regen_table = fq5, table_name5

sql_query = nl2sqlbq_client.text_to_sql_fewshot(
    regen_question,
    table_name=regen_table,
)
print(sql_query)

What about the ratio to licensed facilities?
Table name  calhhs-dashboard-2015-2020-annual-data-file
 
SELECT
  annual_file.Number,
  (
    COUNT(DISTINCT provider.License) /
    SUM(CAST(annual_file.Person AS INT64))
  ) * 100 AS ratio_to_licensed_facilities
FROM `cdii-poc.HHS_Program_Counts.calhhs-dashboard-2015-2020-annual-data-file` AS annual_file
JOIN
  cdii-poc.HHS_Program_Counts.calhhs_medi-cal_managed_care_provider_listing AS provider
ON
  annual_file.Number = provider.County
WHERE
  annual_file.Level = 'County'
  AND provider.RecordType = 'Provider'
GROUP BY
  annual_file.Number
ORDER BY
  ratio_to_licensed_facilities DESC
LIMIT 5;



In [40]:
# Display the questions for which the batch loop caught an exception during SQL generation.
failed_questions

['How have these race and ethnicity trends changed over time?',
 'What about three or more additional programs?',
 'Which programs have the highest co-enrollment with CalFresh?',
 'How many Black individuals are served across CalHHS programs?',
 'Which counties have the highest and lowest ratios of providers to enrolled participants in Medi-Cal?']