In [1]:
# Enable autoreload (mode 2) so edits to imported project modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys  
# Add the parent directory to the module search path; the project package
# `nl2sql_src` imported further down is resolved relative to it.
sys.path.insert(1, '../')

In [2]:
import numpy as np
import faiss
from pandas import DataFrame
from datetime import datetime

In [3]:
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = 'us-central1'
PROJECT_ID

'cdii-poc'

# Queries and Embeddings in a Local File with Local Vector Index Search

## Initialise

In [4]:
# Helper object used by every cell of this local-file section (insert, load,
# index creation and search are all called on `nse`).
from nl2sql_src.nl2sql_query_embeddings import Nl2Sql_embed
nse = Nl2Sql_embed()

## Add new question to the query library

In [5]:
# Placeholders for a question/SQL pair to add to the query library.
new_question = ""
new_sql = ""
# Insert the new question into the embeddings.json file.
# NOTE(review): presumably left commented out so that running the notebook
# does not insert the empty placeholders above -- fill both in first.
# nse.insert_data(new_question, new_sql)

## Retrieve Data

In [6]:
# Load the stored embeddings through the helper. The result is kept for
# inspection only; no later cell shown in this notebook reads it.
embeddings_data = nse.load_embeddings()

## Create VectorDB

In [7]:
# Creates the vector index and saves it in a local file 'saved_index' for later use.
# NOTE(review): the number displayed as this cell's output is presumably the
# count of indexed entries -- confirm against the helper's implementation.
nse.create_vectordb_index()

12


## Search the VectorDB for closest matching queries

In [8]:
# Search the local index for the stored questions closest to a new question.
# Judging by the displayed result, the return value is a list of dicts with
# 'question' and 'sql' keys.
new_query = 'What county has the highest number of beneficiaries enrolled in the WIC per capita?'
output_json = nse.search_matching_queries(new_query)

In [9]:
# Display the matches returned by the local index search.
output_json

[{'question': 'Which county has the lowest enrollment in WIC per capita?',
  'sql': 'SELECT Vendor_Location  AS wic_redemption_by_county_by_participant_category_data_2010_2018_vendor_location, COALESCE(SUM(SAFE_CAST(( SUBSTR (Total_Cost_Vouchers,2,  (INSTR (Total_Cost_Vouchers, ".") -2))  ) AS INT64)), 0) AS wic_redemption_by_county_by_participant_category_data_2010_2018_total_cost_of_vouchers_1 FROM `cdii-poc.HHS_Program_Counts.wic-redemption-by-county-by-participant-category-data-2010-2018`  AS wic_redemption_by_county_by_participant_category_data_2010_2018 WHERE (Total_Cost_Vouchers) <> \'*\' OR (Total_Cost_Vouchers) IS NULL GROUP BY wic_redemption_by_county_by_participant_category_data_2010_2018_vendor_location ORDER BY wic_redemption_by_county_by_participant_category_data_2010_2018_total_cost_of_vouchers_1 ASC limit 1;'},
 {'question': 'Which two counties have the highest number of WIC authorized vendors compared to WIC participants?',
  'sql': 'SELECT Vendor_Location,(vendor_cnt/

# Query and Embeddings in PostgreSQL

## Initialise

In [10]:
%load_ext autoreload
%autoreload 2
from nl2sql_src.nl2sql_query_embeddings import PgSqlEmb

PGPROJ = "cdii-poc"
PGLOCATION = 'us-central1'
PGINSTANCE = "cdii-demo-temp"
PGDB = "demodbcdii"
PGUSER = "postgres"
PGPWD = "cdii-demo"
# INDX_FILE = 'saved_index_pgdata' # optoinal, considered as default

pge = PgSqlEmb(PGPROJ, PGLOCATION, PGINSTANCE, PGDB, PGUSER, PGPWD)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create Table

In [11]:
# Create the table in the PostgreSQL database if it does not already exist.
pge.create_table()

## Retrieve data 

In [15]:
# Read every stored row back from PostgreSQL as a pandas DataFrame.
def extract_data():
    """Fetch all rows through `pge`, print them, and return them as a DataFrame.

    Returns:
        DataFrame built from the rows yielded by ``pge.extract_data().fetchall()``.
    """
    fetched_rows = pge.extract_data().fetchall()
    frame = DataFrame(fetched_rows)
    # The printed frame is the visible output of the cells that call this helper.
    print(frame)
    return frame

df = extract_data()

Empty DataFrame
Columns: []
Index: []


## Clear all Data in DB

In [13]:
# Destructive step (per the section heading, this clears all data in the DB);
# the table is re-read immediately afterwards to show the result.
pge.empty_table()
df = extract_data()

Empty DataFrame
Columns: []
Index: []


## Add rows of data to the DB

In [16]:
import json

# Load the question/SQL library from the dataset file.
with open("../dataset/embeddings.json", "r") as f:
    data = json.load(f)

# Upper bound on the number of rows inserted; the file may hold fewer entries.
num_rows = 15
# Slicing replaces the hand-rolled counter, which was named `iter` and so
# shadowed the builtin for the rest of the kernel session.
for elem in data[:num_rows]:
    # The stored elem['question_embedding'] is intentionally not passed;
    # presumably insert_row computes the embedding itself -- TODO confirm.
    pge.insert_row(elem['question'], elem['sql'])

1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array
1 length of new array


In [23]:
# Re-read the table after the inserts and display it as the cell result.
df = extract_data()
df

    q_id                                           question  \
0    417  How many people are enrolled for Medi-Cal prog...   
1    418            How many of them live in Nevada County?   
2    419  How has participation in Medi-Cal progressed o...   
3    420  How does CalWorks program participation trends...   
4    421  In what way the race and ethnicity trends chan...   
5    422  Which county has the greatest proportion of FP...   
6    423  Which programs have the highest co-enrollment ...   
7    424  Which county has the lowest enrollment in WIC ...   
8    425  Which two counties have the highest number of ...   
9    426  How many Hispanic beneficiaries are being serv...   
10   427                   What is the breakdown by program   
11   428  Which counties have the highest and lowest rat...   

                                                  sql  \
0   SELECT  COALESCE(SUM(SAFE_CAST( Person AS INT6...   
1   SELECT SUM(CAST(Person AS INT64)) AS total_ben...   
2   selec

Unnamed: 0,q_id,question,sql,query_embedding
0,417,How many people are enrolled for Medi-Cal prog...,SELECT COALESCE(SUM(SAFE_CAST( Person AS INT6...,"[0.009485369548201561, 0.003129353281110525, -..."
1,418,How many of them live in Nevada County?,SELECT SUM(CAST(Person AS INT64)) AS total_ben...,"[-0.03890972211956978, -0.01862681470811367, -..."
2,419,How has participation in Medi-Cal progressed o...,"select year, sum(total_beneficiaries) as tota...","[0.05054508149623871, 0.001993116457015276, -0..."
3,420,How does CalWorks program participation trends...,SELECT COALESCE(SUM(SAFE_CAST( White AS INT6...,"[0.04451335594058037, -0.030554158613085747, -..."
4,421,In what way the race and ethnicity trends chan...,SELECT fileyear AS calhhs_dashboard_2015_2020...,"[0.04810802638530731, -0.012473777867853642, -..."
5,422,Which county has the greatest proportion of FP...,"SELECT Number AS county, COALESCE(SUM(SAFE_CAS...","[0.03396085649728775, -0.019975077360868454, -..."
6,423,Which programs have the highest co-enrollment ...,SELECT COALESCE(SUM(SAFE_CAST( Person AS INT...,"[0.04807215929031372, -0.01924615353345871, -0..."
7,424,Which county has the lowest enrollment in WIC ...,SELECT Vendor_Location AS wic_redemption_by_c...,"[0.050651632249355316, -0.04634733870625496, -..."
8,425,Which two counties have the highest number of ...,"SELECT Vendor_Location,(vendor_cnt/total_parti...","[0.030228829011321068, -0.058050718158483505, ..."
9,426,How many Hispanic beneficiaries are being serv...,SELECT COALESCE(SUM(SAFE_CAST( Hispanic AS ...,"[0.011656521819531918, 0.01710379682481289, -0..."


## Insert records to PostgreSQL and update index

In [None]:
# pge.insert_row(data[10]['question'], data[10]['sql'])
# pge.insert_row(data[11]['question'], data[11]['sql'])

## Search the VectorDB for closest matching queries

In [25]:
# Search for the stored questions closest to a new question, this time through
# the PostgreSQL-backed helper `pge` (same question as in the local-file section).
new_query = 'What county has the highest number of beneficiaries enrolled in the WIC per capita?'
output_json = pge.search_matching_queries(new_query)

In [26]:
# Display the matches returned by the PostgreSQL-backed search.
output_json

[{'question': 'Which county has the lowest enrollment in WIC per capita?',
  'sql': 'SELECT Vendor_Location  AS wic_redemption_by_county_by_participant_category_data_2010_2018_vendor_location, COALESCE(SUM(SAFE_CAST(( SUBSTR (Total_Cost_Vouchers,2,  (INSTR (Total_Cost_Vouchers, ".") -2))  ) AS INT64)), 0) AS wic_redemption_by_county_by_participant_category_data_2010_2018_total_cost_of_vouchers_1 FROM `cdii-poc.HHS_Program_Counts.wic-redemption-by-county-by-participant-category-data-2010-2018`  AS wic_redemption_by_county_by_participant_category_data_2010_2018 WHERE (Total_Cost_Vouchers) <> \'*\' OR (Total_Cost_Vouchers) IS NULL GROUP BY wic_redemption_by_county_by_participant_category_data_2010_2018_vendor_location ORDER BY wic_redemption_by_county_by_participant_category_data_2010_2018_total_cost_of_vouchers_1 ASC limit 1;'},
 {'question': 'Which two counties have the highest number of WIC authorized vendors compared to WIC participants?',
  'sql': 'SELECT Vendor_Location,(vendor_cnt/

## Clear VectorDB and re-create VectorDB from all data in PostgreSQL

In [24]:
# Per the section heading: clear the vector index and rebuild it from all data
# in PostgreSQL. The call prints the number of elements it processed.
pge.recreate_vectordb_index()

Number of elements :  12


## Create Matching Engine VectorDB for searches

In [None]:
# Month-day-hour-minute suffix so repeated runs get distinct resource names.
UID = datetime.now().strftime("%m%d%H%M")
BUCKET_URI = f"gs://{PROJECT_ID}-vs-cdii-questions-{UID}"

# This cell used to start with a bare `end`, which raised NameError before UID
# and BUCKET_URI were ever defined. NOTE(review): that looks like a deliberate
# "Run All" stopper ahead of the cells that create cloud resources (bucket,
# index, endpoint) -- the stop is kept, but made explicit and switchable.
RUN_MATCHING_ENGINE_SECTION = False
if not RUN_MATCHING_ENGINE_SECTION:
    raise RuntimeError(
        "Stopping before the Matching Engine section, which creates cloud resources. "
        "Set RUN_MATCHING_ENGINE_SECTION = True (or run the following cells manually) to continue."
    )

In [None]:
# Extra dependencies needed only by this section: the Vertex AI SDK and the
# JSON Lines writer used to export the embeddings.
from google.cloud import aiplatform
import jsonlines
# Point the SDK at the project and region selected near the top of the notebook.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Pull the questions, SQL statements and embeddings out of PostgreSQL.
# Later cells treat the three results as parallel sequences: the 1-based
# position of an embedding is used as its id and mapped back with [id - 1].
questions, sqls, embeddings = pge.extract_pg_embeddings()

In [None]:
EMBED_FILE = 'cdii_pg_embeddings.json'
q_id = 1
data = []
for q_emb in embeddings:
    jsonl = {
                "id": q_id,
                "embedding" : q_emb
            }
    q_id += 1
    data.append(jsonl)


with jsonlines.open(EMBED_FILE, mode='w') as writer:
    for item in data:
        writer.write(item)

! gsutil mb -l "$LOCATION" -p "$PROJECT_ID" "$BUCKET_URI"
! gsutil cp "$EMBED_FILE" "$BUCKET_URI"

In [None]:
# Create the index from the embeddings file uploaded to BUCKET_URI.
me_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"vs_cdii_queries_index_{UID}",
    contents_delta_uri = BUCKET_URI,
    # NOTE(review): 768 is presumably the length of each embedding vector
    # produced by the embedding model -- confirm before changing models.
    dimensions = 768,
    approximate_neighbors_count = 3
)


In [None]:
# Create the search endpoint with a public endpoint enabled. This call only
# creates the endpoint; deploying the index to it is a separate deploy_index call.
me_index_ep = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name = f"vs_cdii_ep_{UID}",
    public_endpoint_enabled=True
)


In [None]:
# Deploy the index to the endpoint under a run-specific id; the same id is
# needed again when querying the endpoint.
DEPLOYED_INDEX_ID = f"vs_cdii_deployed_index_{UID}"
me_index_ep.deploy_index(index=me_index, deployed_index_id=DEPLOYED_INDEX_ID)

In [None]:
new_query = 'What county has the highest number of beneficiaries enrolled in the WIC per capita?'

# Embed the question with the local-file helper `nse`; only the first element
# of the returned pair is used here.
q_embeddings, _ = nse.generate_embedding(new_query)

# Ask the deployed index for the 3 nearest stored embeddings of this one query.
response = me_index_ep.find_neighbors(
    deployed_index_id = DEPLOYED_INDEX_ID,
    queries = [q_embeddings],
    num_neighbors=3
)
response

In [None]:
# response[0] holds the neighbours of the single query that was sent.
# Neighbour ids are the 1-based positions assigned when the embeddings were
# exported, so subtract 1 to index the parallel `questions` / `sqls` lists.
for neighbor in response[0]:
    indx = int(neighbor.id)
    print(questions[indx-1], '\n', sqls[indx-1])

## Test clearing of index file

In [None]:
# Load a saved FAISS index from disk to experiment with removing entries.
# NOTE(review): 'saved_index_pgdata-tmp' looks like a scratch copy of the
# PostgreSQL-backed index file -- confirm it exists before running.
from faiss import write_index, read_index, IDSelectorBatch
index = read_index('saved_index_pgdata-tmp')

In [None]:
# `ids` was never defined in this notebook, and the first argument of
# IDSelectorBatch is the NUMBER of ids behind the pointer, not the embedding
# dimension (768), so the original call could not work.
# Assumes the vectors carry sequential ids 0..ntotal-1 -- TODO confirm; if the
# index was built with custom ids, put those ids in this array instead.
ids = np.arange(index.ntotal, dtype=np.int64)
idsel = IDSelectorBatch(len(ids), faiss.swig_ptr(ids))
index.remove_ids(idsel)