In [5]:
%load_ext autoreload
%autoreload 2

import sys  
sys.path.insert(1, '../')

In [1]:
import numpy as np
import faiss
from pandas import DataFrame
from datetime import datetime

In [2]:
# Ask the gcloud CLI for the active project; the `!` shell capture yields a list of output lines.
PROJECT_ID = ! gcloud config get project
# Keep only the first output line as the project id string.
PROJECT_ID = PROJECT_ID[0]
LOCATION = 'us-central1'
# Echo the resolved project id as the cell output.
PROJECT_ID

'cdii-poc'

# Queries and embeddings in a local file, with local vector index search

## Initialise

In [10]:
from nl2sql_src.nl2sql_query_embeddings import Nl2Sql_embed
# Helper object used by the local-file cells of this notebook (load_embeddings,
# create_vectordb_index, search_matching_queries, generate_embedding).
nse = Nl2Sql_embed()

## Add new question to the query library

In [11]:
# Placeholders: fill in the natural-language question and its SQL before inserting.
new_question = ""
new_sql = ""
# Uncomment the call below to insert the new question into the embeddings.json file
# nse.insert_data(new_question, new_sql)

## Retrieve Data

In [12]:
# Load the embeddings data through the helper (presumably from embeddings.json — confirm in Nl2Sql_embed).
embeddings_data = nse.load_embeddings()

## Create VectorDB

In [13]:
# Build the vector index. Per the original note, the index is saved to a local file
# named 'saved_index' for later use (the file name is not visible here — confirm in Nl2Sql_embed).
nse.create_vectordb_index()

12


## Search the VectorDB for closest matching queries

In [14]:
# Search the local index for the stored questions closest to a new natural-language question
new_query = 'What county has the highest number of beneficiaries enrolled in the WIC per capita?'
# The result (shown in the next cell) is a list of dicts with 'question' and 'sql' keys.
output_json = nse.search_matching_queries(new_query)

In [15]:
# Display the matches returned by nse.search_matching_queries
output_json

[{'question': 'Which five counties have the lowest number of WIC authorized vendors compared to WIC participants?',
  'sql': 'SELECT Vendor_Location,(vendor_cnt/total_participants)*100 as vendor_participants_ratio FROM ((SELECT TRIM(Vendor_Location) AS Vendor_Location,COALESCE(SUM(SAFE_CAST(_Number_of_Participants_Redeemed_ AS INT64))) as total_participants FROM `cdii-poc.HHS_Program_Counts.wic-redemption-by-county-by-participant-category-data-2010-2018`  group by Vendor_Location) as participants JOIN (SELECT TRIM(COUNTY) AS COUNTY,count(VENDOR) as vendor_cnt FROM `cdii-poc.HHS_Program_Counts.women-infants-and-children-wic-authorized-vendors`  group by COUNTY having COUNTY is not null) as vendors ON UPPER(participants.Vendor_Location)=UPPER(vendors.COUNTY)) WHERE (vendor_cnt/total_participants)*100 is not null order by vendor_participants_ratio asc limit 5;'},
 {'question': 'How has participation in CalFresh changed since 2015?',
  'sql': "select  year, sum(total_beneficiaries) as tota

# Query and Embeddings in PostgreSQL

## Initialise

In [19]:
%load_ext autoreload
%autoreload 2
from nl2sql_src.nl2sql_query_embeddings import PgSqlEmb

PGPROJ = "cdii-poc"
PGLOCATION = 'us-central1'
PGINSTANCE = "cdii-demo-temp"
PGDB = "demodbcdii"
PGUSER = "postgres"
PGPWD = "cdii-demo"
# INDX_FILE = 'saved_index_pgdata' # optoinal, considered as default

pge = PgSqlEmb(PGPROJ, PGLOCATION, PGINSTANCE, PGDB, PGUSER, PGPWD)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create Table

In [20]:
# Create the table in the PostgreSQL database if it does not already exist
# (per the original note; the actual check lives in PgSqlEmb.create_table).
pge.create_table()

## Retrieve data 

In [21]:
# Read every row from the PostgreSQL table and hand it back as a DataFrame
def extract_data():
    """Fetch all rows via ``pge.extract_data()`` and return them as a DataFrame.

    The object returned by ``pge.extract_data()`` must expose ``fetchall()``.
    The resulting frame is printed before being returned, so calling this
    function always shows the current table contents.
    """
    rows = pge.extract_data().fetchall()
    frame = DataFrame(rows)
    print(frame)
    return frame

# Fetch the current table contents (extract_data also prints the frame).
df = extract_data()

   q_id                                           question  \
0   397       How many of them live in Los Angeles County?   
1   398  How has participation in CalFresh changed sinc...   
2   399  How do CalFresh program participation trends d...   
3   400  How have these race and ethnicity trends chang...   
4   401  Which programs have the highest co-enrollment ...   
5   402  Which five counties have the lowest number of ...   
6   403  How many Black individuals are served across C...   
7   404                   What is the breakdown by program   

                                                 sql  \
0  SELECT SUM(CAST(Person AS INT64)) AS total_ben...   
1  select  year, sum(total_beneficiaries) as tota...   
2  SELECT COALESCE(SUM(SAFE_CAST( White   AS INT6...   
3  SELECT fileyear  AS calhhs_dashboard_2015_2020...   
4  SELECT COALESCE(SUM(SAFE_CAST( Person   AS INT...   
5  SELECT Vendor_Location,(vendor_cnt/total_parti...   
6  SELECT  COALESCE(SUM(SAFE_CAST( Black   AS INT

## Clear all Data in DB

In [None]:
# Clear the table (see the section heading), then re-read it to show what is left.
pge.empty_table()
df = extract_data()

## Add rows data to DB

In [None]:
import json
from itertools import islice

# Load the question/SQL entries from the local embeddings file.
with open("embeddings.json", "r") as f:
    data = json.load(f)

# Insert at most `num_rows` entries into PostgreSQL. islice replaces the manual
# counter, whose name `iter` shadowed the builtin for the rest of the kernel session.
num_rows = 15
for elem in islice(data, num_rows):
    # Only the question and SQL text are passed; elem['question_embedding'] is not sent.
    pge.insert_row(elem['question'], elem['sql'])

In [22]:
# Re-read the table; the bare `df` renders the rich DataFrame view in addition to
# the plain-text print done inside extract_data().
df = extract_data()
df

   q_id                                           question  \
0   397       How many of them live in Los Angeles County?   
1   398  How has participation in CalFresh changed sinc...   
2   399  How do CalFresh program participation trends d...   
3   400  How have these race and ethnicity trends chang...   
4   401  Which programs have the highest co-enrollment ...   
5   402  Which five counties have the lowest number of ...   
6   403  How many Black individuals are served across C...   
7   404                   What is the breakdown by program   

                                                 sql  \
0  SELECT SUM(CAST(Person AS INT64)) AS total_ben...   
1  select  year, sum(total_beneficiaries) as tota...   
2  SELECT COALESCE(SUM(SAFE_CAST( White   AS INT6...   
3  SELECT fileyear  AS calhhs_dashboard_2015_2020...   
4  SELECT COALESCE(SUM(SAFE_CAST( Person   AS INT...   
5  SELECT Vendor_Location,(vendor_cnt/total_parti...   
6  SELECT  COALESCE(SUM(SAFE_CAST( Black   AS INT

Unnamed: 0,q_id,question,sql,query_embedding
0,397,How many of them live in Los Angeles County?,SELECT SUM(CAST(Person AS INT64)) AS total_ben...,"[-0.02457154355943203, -0.0190061517059803, -0..."
1,398,How has participation in CalFresh changed sinc...,"select year, sum(total_beneficiaries) as tota...","[0.012649155221879482, 0.00679197208955884, -0..."
2,399,How do CalFresh program participation trends d...,SELECT COALESCE(SUM(SAFE_CAST( White AS INT6...,"[0.00884826947003603, -0.014465585350990295, -..."
3,400,How have these race and ethnicity trends chang...,SELECT fileyear AS calhhs_dashboard_2015_2020...,"[0.04074668511748314, -0.0178860891610384, -0...."
4,401,Which programs have the highest co-enrollment ...,SELECT COALESCE(SUM(SAFE_CAST( Person AS INT...,"[0.02137165702879429, -0.013408240862190723, -..."
5,402,Which five counties have the lowest number of ...,"SELECT Vendor_Location,(vendor_cnt/total_parti...","[0.04454430192708969, -0.047861456871032715, -..."
6,403,How many Black individuals are served across C...,SELECT COALESCE(SUM(SAFE_CAST( Black AS INT...,"[0.017948927357792854, -0.0162324458360672, -0..."
7,404,What is the breakdown by program,"SELECT Program,COALESCE(SUM(SAFE_CAST( Black ...","[-0.0013795827981084585, -0.03709886968135834,..."


## Insert records into PostgreSQL and update the index

In [None]:
# Optional example: uncomment to insert individual entries from `data`
# (the list loaded from embeddings.json in the "Add rows data to DB" cell).
# pge.insert_row(data[10]['question'], data[10]['sql'])
# pge.insert_row(data[11]['question'], data[11]['sql'])

## Search the VectorDB for closest matching queries

In [23]:
# Search for the closest matching queries, this time through the PostgreSQL-backed helper
new_query = 'What county has the highest number of beneficiaries enrolled in the WIC per capita?'
output_json = pge.search_matching_queries(new_query)

In [None]:
# Display the matches returned by pge.search_matching_queries
output_json

## Clear VectorDB and re-create VectorDB from all data in PostgreSQL

In [24]:
# Rebuild the vector index from the data held in PostgreSQL (see the section heading).
pge.recreate_vectordb_index()

Number of elements :  8


## Create Matching Engine VectorDB for searches

In [None]:
# NOTE(review): `end` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError and the two assignments below never run. Presumably this is a
# deliberate stop so "Run All" halts before cloud resources are created — confirm,
# and remove the line to run the Matching Engine section.
end
# Timestamp suffix (month, day, hour, minute) used to make resource names unique per run.
UID = datetime.now().strftime("%m%d%H%M")
# Cloud Storage bucket that receives the exported embeddings file.
BUCKET_URI = f"gs://{PROJECT_ID}-vs-cdii-questions-{UID}"

In [None]:
from google.cloud import aiplatform
import jsonlines
# Point the Vertex AI SDK at the project and region resolved in the setup cells.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Unpack the three values returned by the helper. The later cells treat them as
# position-aligned sequences (questions[i], sqls[i] and embeddings[i] belong together).
questions, sqls, embeddings = pge.extract_pg_embeddings()

In [None]:
EMBED_FILE = 'cdii_pg_embeddings.json'
q_id = 1
data = []
for q_emb in embeddings:
    jsonl = {
                "id": q_id,
                "embedding" : q_emb
            }
    q_id += 1
    data.append(jsonl)


with jsonlines.open(EMBED_FILE, mode='w') as writer:
    for item in data:
        writer.write(item)

! gsutil mb -l "$LOCATION" -p "$PROJECT_ID" "$BUCKET_URI"
! gsutil cp "$EMBED_FILE" "$BUCKET_URI"

In [None]:
# Create the Index
me_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"vs_cdii_queries_index_{UID}",
    contents_delta_uri = BUCKET_URI,
    dimensions = 768,
    approximate_neighbors_count = 3
)


In [None]:
# Create Search Endpoint and Deploy 
me_index_ep = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name = f"vs_cdii_ep_{UID}",
    public_endpoint_enabled=True
)


In [None]:
# The deployed-index id reuses the UID suffix shared by the bucket, index and endpoint names.
DEPLOYED_INDEX_ID = f"vs_cdii_deployed_index_{UID}"
# Attach the index to the endpoint under that id; find_neighbors refers to it by this id.
me_index_ep.deploy_index(index=me_index, deployed_index_id=DEPLOYED_INDEX_ID)

In [None]:
new_query = 'What county has the highest number of beneficiaries enrolled in the WIC per capita?'

# Embed the query text; generate_embedding returns a pair and only its first item is used.
q_embeddings, _ = nse.generate_embedding(new_query)

# Ask the deployed index for the 3 nearest neighbours of the query embedding.
response = me_index_ep.find_neighbors(
    deployed_index_id = DEPLOYED_INDEX_ID,
    queries = [q_embeddings],
    num_neighbors=3
)
response

In [None]:
# Map each neighbour of the first (only) query back to its question/SQL pair.
# Datapoint ids were assigned starting at 1, hence the -1 when indexing the lists.
for neighbor in response[0]:
    pos = int(neighbor.id) - 1
    print(questions[pos], '\n', sqls[pos])

## Test clearing of index file

In [None]:
from faiss import write_index, read_index, IDSelectorBatch
# Load the FAISS index stored under this file name.
index = read_index('saved_index_pgdata-tmp')

In [None]:
# IDSelectorBatch takes the NUMBER of ids followed by a pointer to an int64 id array.
# The original passed 768 as the count and referenced an `ids` array that was never
# defined, so the cell failed with NameError.
# NOTE(review): assumes the index uses the sequential ids 0..ntotal-1 that FAISS assigns
# when vectors are added without explicit ids — confirm (e.g. if an IndexIDMap is used).
ids = np.arange(index.ntotal, dtype=np.int64)
idsel = IDSelectorBatch(ids.size, faiss.swig_ptr(ids))
# Remove every selected id, i.e. clear the index.
index.remove_ids(idsel)