In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
proj_path = Path('.').resolve()
sys.path.append(str(proj_path))

import matplotlib.pyplot as plt
import seaborn as sns


import json
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
from typing import Optional
from collections import defaultdict
from dotenv import load_dotenv, find_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain_core.runnables import RunnableSequence
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate


_ = load_dotenv(find_dotenv())

from src.db_utils import get_schema_str, get_data_dict
from src.pymodels import (
    DatabaseModel, 
    SpiderSample, 
    BirdSample, 
    BODescription,
    SQLResponse
)
from src.prompts import Prompts
from src.database import SqliteDatabase
from src.data_preprocess import (
    load_raw_data,
    process_all_tables,
    filter_samples_by_count_spider_bird,
    process_samples_bird,
    split_train_dev_test,
    save_samples_spider_bird,
    load_samples_spider_bird,
)

from src.parsing_sql import Schema, extract_all
from src.eval_utils import get_complexity, result_eq, check_if_exists_orderby
from run_bo_sql import get_vector_store
from copy import deepcopy
# Load the raw BIRD tables plus the train/dev splits (the test split is not requested).
bird_path = proj_path / 'data' / 'bird'
tables, train_data, dev_data = load_raw_data(bird_path, load_test=False)

# Description JSON that is handed to `process_all_tables` below.
with (proj_path / 'data' / 'bird_description.json').open() as f:
    all_descriptions = json.load(f)

# `bird_tables` is the processed form of `tables`; the three sample lists are
# read back from JSON files under data/ rather than rebuilt in this notebook.
bird_tables = process_all_tables(tables, descriptions=all_descriptions)
train_samples = load_samples_spider_bird(proj_path / 'data' / 'bird_train.json')
dev_samples = load_samples_spider_bird(proj_path / 'data' / 'bird_dev.json')
test_samples = load_samples_spider_bird(proj_path / 'data' / 'bird_test.json')

In [3]:
# from langchain.globals import set_llm_cache
# from langchain_community.cache import SQLiteCache
# set_llm_cache(SQLiteCache(database_path=f"./cache/valid_bo_bird_dev.db"))

In [612]:
# Delete cached LLM rows whose stored response text contains 'JSONDecodeError'.
# The DELETE runs inside an explicit transaction and is committed only on success.
file_name = 'soccer_2016-0'
db = SqliteDatabase(f"./cache/valid_bo_{file_name}.db")
db.start()
try:
    c = db.con.cursor()
    c.execute('BEGIN TRANSACTION')
    c.execute("""
DELETE FROM full_llm_cache WHERE response LIKE '%JSONDecodeError%';
""")
    # c.execute('''
    # DELETE FROM full_llm_cache
    # WHERE prompt LIKE '%Which teams have had a player awarded the Purple Cap and another with the Orange Cap%'
    # ''')
    db.con.commit()
finally:
    # Release the database handle even when a statement above raises, so a
    # failed cleanup does not leave the cache file open. Presumably closing
    # without a commit discards the pending transaction - TODO confirm
    # against SqliteDatabase.close().
    db.close()
# JSONDecodeError
# ValidationError

In [611]:
from pprint import pprint
# Show the cache rows whose prompt contains the question below.
# NOTE(review): in top-to-bottom order this runs after the cell that calls
# `db.close()`; whether `db.execute` reopens the connection is not visible
# here - confirm, or move this cell above the cleanup cell.
df = db.execute(
'''
SELECT * FROM full_llm_cache
WHERE prompt LIKE '%Which teams have had a player awarded the Purple Cap and another with the Orange Cap%'
''')
df

Unnamed: 0,prompt,llm,idx,response


---

In [35]:
from src.eval_utils import get_structural_score, get_all_structural_score, get_all_semantic_score, NLP_SPACY, partial_matching_with_penalty
from run_evaluation import get_target_parsed_sql, get_prediction_parsed_sql
from run_bo_sql import _get_categories, _format_interval, get_retriever
from bert_score import score as bscore
from transformers import logging as tfloggings
tfloggings.set_verbosity_error()
import warnings
# Evaluation configuration: dataset name, prediction task, split and scenario id.
# These globals drive every path built in the cells below.
ds = 'bird'
task = 'zero_shot_hint'
typ = 'test'
scenario = 3
description_file = f'{ds}_description.json' # description JSON located under data/
experiment_folder = proj_path / 'experiments' / ds
prediction_path = experiment_folder / 'predictions' / task
eval_path = experiment_folder / 'evals' / task

# Reload the raw tables for `ds`, then rebind `tables` to the processed version
# (the raw value is not kept under another name).
tables, *_ = load_raw_data(proj_path / 'data' / ds, load_test=False)
with (proj_path / 'data' / description_file).open() as f:
    all_descriptions = json.load(f)
tables = process_all_tables(tables, descriptions=all_descriptions)

In [56]:
# Merge the per-database prediction files of the selected split/scenario into
# one list, dropping the verbose 'rationale' field and tagging each record
# with the database id taken from the file name.
file_post_fix = f'{ds}_{typ}' if scenario < 0 else f'{ds}_{typ}_{scenario}'
final_file = f'final_{file_post_fix}.json'
# NOTE(review): when the final file already exists nothing below runs, so
# `all_results` stays undefined (or stale) for the following cells - confirm
# whether the final file should be loaded here instead.
if not (prediction_path / final_file).exists():
    all_results = []
    paths = sorted(prediction_path.glob(f'{file_post_fix}_*.json'))
    for p in paths:
        with p.open() as f:
            results = json.load(f)
        # Strip the known prefix instead of splitting on a fixed number of
        # underscores: with `scenario < 0` the prefix has one underscore less,
        # and a fixed split would truncate database ids containing '_'.
        db_id = p.stem.removeprefix(f'{file_post_fix}_')
        for r in results:
            # Tolerate records that were saved without a rationale.
            r.pop('rationale', None)
            r['db_id'] = db_id
        all_results.extend(results)

In [66]:
preds = all_results
samples = load_samples_spider_bird(proj_path / 'data' / f'{ds}_{typ}.json')

In [67]:
res_db_ids = {r['db_id'] for r in preds}

In [None]:
# Keep only the predictions and gold samples whose database has predictions.
preds = list(filter(lambda pred: pred['db_id'] in res_db_ids, preds))
samples = list(filter(lambda sample: sample.db_id in res_db_ids, samples))

In [68]:
pred_parsed, _ = get_prediction_parsed_sql(preds, tables)

superstore: 100%|██████████| 24/24 [00:00<00:00, 26.37it/s]


In [72]:
[sample_id for _, sample_id_ast in pred_parsed.items() for sample_id, ast in sample_id_ast.items() if not ast]

[]

In [7]:
# Concatenate every per-database evaluation JSON for this dataset/split and
# persist the combined table as a CSV next to them.
all_results = []
for eval_file in eval_path.glob(f'{ds}_{typ}_*.json'):
    all_results += json.loads(eval_file.read_text())

df = pd.DataFrame(all_results)
df.to_csv(eval_path / f'{ds}_{typ}.csv')

In [8]:
df_base = pd.read_csv(experiment_folder / 'evals' / 'zero_shot' / f'{ds}_{typ}.csv')
df_bo = pd.read_csv(experiment_folder / 'evals' / 'valid_bo' / f'{ds}_{typ}.csv')

In [9]:
# Derive a per-row 'category' from each database's gold complexity values
# (via the `_get_categories` / `_format_interval` helpers) and attach it to
# the baseline frame by row index.
# NOTE(review): re-running this cell merges 'category' into `df_base` again.
complexity_by_db = df_base.groupby('db_id')['gold_complexity']
df_cates = complexity_by_db.apply(_get_categories).rename('category').apply(_format_interval)
category_by_row = df_cates.reset_index('db_id', drop=True)
df_base = df_base.merge(category_by_row, left_index=True, right_index=True)

# Pair every BO-assisted evaluation row with the baseline row of the same sample.
df = df_bo.merge(
    df_base,
    how='inner',
    on=['db_id', 'sample_id', 'gold_complexity'],
    suffixes=('_bo', '_base'),
)

In [10]:
group_column = ['db_id', 'retrieved']


def _bo_minus_base(base_col, bo_col, how, name):
    """Aggregate both columns per (db_id, retrieved) and return bo - base as a named Series."""
    grouped = df.groupby(group_column)[[base_col, bo_col]]
    reduced = grouped.sum() if how == 'sum' else grouped.mean()
    return reduced.diff(axis=1)[bo_col].rename(name)


# Execution results are summed; the score columns are averaged.
execution_improvement = _bo_minus_base('exec_result_base', 'exec_result_bo', 'sum', 'execution_improvement')
merit_structural = _bo_minus_base('structural_score_base', 'structural_score_bo', 'mean', 'merit_structural')
merit_semantic = _bo_minus_base('semantic_score_base', 'semantic_score_bo', 'mean', 'merit_semantic')
merit = _bo_minus_base('f1_score_base', 'f1_score_bo', 'mean', 'merit')

# Rank the retrieved items inside each database by descending merit; ties are
# broken by order of appearance (method='first').
merit_flat = merit.reset_index()
ranks = (
    merit_flat.groupby(['db_id'])['merit']
    .rank(method='first', ascending=False)
    .rename('rank')
    .astype(np.int64)
)
merit = pd.concat([merit_flat, ranks], axis=1)
merit_by_rank = merit.sort_values(by=['db_id', 'rank'], ascending=True)

# merit.head()

In [25]:
(merit_by_rank.groupby('db_id')['merit'].mean().sort_values(ascending=False) > 0.0).value_counts()

merit
True     59
False    13
Name: count, dtype: int64

In [12]:
# Collect, per database, the retrieved ids in rank order.
test_bos = defaultdict(list)
ranked_pairs = merit_by_rank.loc[:, ['db_id', 'retrieved']]
for db_id, retrieved_id in zip(ranked_pairs['db_id'].tolist(), ranked_pairs['retrieved'].tolist()):
    test_bos[db_id].append(retrieved_id)

# One scenario per cut-off (5, 10, ..., 25): the top-n ids of every database.
n_bos = range(5, 26, 5)
test_scenarios = defaultdict(dict)
for n_bo in n_bos:
    for db_id, ranked_ids in test_bos.items():
        test_scenarios[n_bo][db_id] = ranked_ids[:n_bo]

In [13]:
# Persist the scenarios. `test_scenarios` is keyed by the integer cut-offs;
# JSON object keys are always strings, so a reader of this file gets
# '5', '10', ... rather than 5, 10, ...
with (experiment_folder / 'test_scenarios.json').open('w') as f:
    json.dump(test_scenarios, f)

In [536]:
with (experiment_folder / 'predictions' / 'create_bo' / f'final_{ds}_train_bo.json').open() as f:
    all_bos = json.load(f)

# Keep, for every database in the 25-item scenario, only the BO records whose
# sample_id was selected for that database.
test_bo_ids = test_scenarios[25]
test_bos = defaultdict(list)
for db_id, bos in all_bos.items():
    if db_id not in test_bo_ids:
        continue
    bo_ids = test_bo_ids[db_id]
    test_bos[db_id].extend([bo for bo in bos if bo['sample_id'] in bo_ids])

In [617]:
# Print (db_id, sample_id) for every BO record with a missing or empty 'question'.
for db_id, bos in all_bos.items():
    for bo in bos:
        if bo.get('question'):
            continue
        print(db_id, bo['sample_id'])

In [541]:
vectorstore = get_vector_store(test_bos)

In [563]:
test_samples = load_samples_spider_bird(proj_path / 'data' / f'{ds}_test.json')

In [564]:
samples = [x for x in test_samples if x.db_id in test_bo_ids]

In [546]:
retriever = get_retriever(vectorstore, 'manufactory_1')

In [None]:
q = 'What are the names and prices of products that cost at least 180, sorted by price decreasing and name ascending?'
retriever.invoke(q)

[Document(metadata={'sample_id': 5318, 'db_id': 'manufactory_1', 'vt': 'SELECT products.name, products.price FROM products'}, page_content="The virtual table describes the names and prices of products from the 'products' table."),
 Document(metadata={'sample_id': 5320, 'db_id': 'manufactory_1', 'vt': 'SELECT products.name FROM products WHERE products.price <= [placeholder-type:numeric]'}, page_content="The virtual table describes the names of products from the 'products' table that have a retail price less than or equal to a specified value. The placeholder in the WHERE clause represents the maximum price limit for filtering the products."),
 Document(metadata={'sample_id': 5343, 'db_id': 'manufactory_1', 'vt': 'SELECT products.code, products.name, MIN(products.price) FROM products GROUP BY products.name'}, page_content="The virtual table displays the unique product codes and names from the 'products' table along with the minimum price for each product. The results are grouped by produ

In [265]:
# For each database keep the first `n_samples` rows by rank.
# Still to verify: whether ranks are unique inside a database.
n_samples = 10
final_bos = [
    group.sort_values('rank').iloc[:n_samples]
    for db_id, group in merit.groupby('db_id')
]

In [266]:
final_bos[0]

Unnamed: 0,db_id,retrieved,merit,rank
32,activity_1,6776,-0.001267,1
15,activity_1,6736,-0.001267,1
14,activity_1,6735,-0.001267,1
37,activity_1,6784,-0.001267,1
20,activity_1,6750,-0.001267,1
8,activity_1,6725,-0.001267,1
39,activity_1,6786,-0.001267,1
25,activity_1,6761,-0.001267,1
27,activity_1,6764,-0.001267,1
30,activity_1,6774,-0.001267,1


In [257]:
merit.groupby('db_id')['merit'].describe().sort_values('mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
db_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
epinions_1,18.0,0.806900,1.142410e-16,0.806900,0.806900,0.806900,0.806900,0.806900
gas_company,27.0,0.377211,1.635068e-01,0.233700,0.233700,0.233700,0.556600,0.556600
loan_1,46.0,0.267752,2.932192e-01,-0.113888,-0.068069,0.500000,0.500000,0.500000
local_govt_and_lot,10.0,0.258400,0.000000e+00,0.258400,0.258400,0.258400,0.258400,0.258400
small_bank_1,30.0,0.224476,1.876357e-01,0.039994,0.039994,0.224476,0.408958,0.408958
...,...,...,...,...,...,...,...,...
poker_player,19.0,-0.082278,1.946600e-03,-0.084500,-0.084500,-0.080663,-0.080663,-0.080663
browser_web,10.0,-0.088820,1.922665e-02,-0.094900,-0.094900,-0.094900,-0.094900,-0.034100
cre_Doc_Template_Mgt,30.0,-0.112200,2.823006e-17,-0.112200,-0.112200,-0.112200,-0.112200,-0.112200
wta_1,30.0,-0.159026,1.591789e-01,-0.294254,-0.294254,-0.170126,-0.170126,0.133629


In [207]:
from scipy.stats import rankdata



In [242]:
df_rank = merit.reset_index().groupby(['db_id'])['merit'].apply(lambda x: rankdata(-x.values, method='min'))

In [224]:
df_rank.loc['activity_1']

array([31, 31,  1,  1, 16, 16, 16, 31,  1, 31, 16, 31, 31, 31,  1,  1, 16,
       16,  1, 16,  1, 31, 16, 16, 16,  1,  1,  1, 31, 31,  1, 16,  1, 31,
       31, 31, 16,  1, 16,  1,  1, 16, 16])

In [None]:
df_rank.loc

KeyError: 'address'

In [216]:
x = np.array([2, -1, 0, 3])
rank = rankdata(-x, method='average')
print(rank)
np.argsort(rank)

[2. 4. 3. 1.]


array([3, 0, 2, 1])

In [169]:
df

Unnamed: 0,sample_id,sample_id_bo,db_id_bo,retrieved,gold_complexity_bo,structural_score_bo,semantic_score_bo,f1_score_bo,exec_result_bo,sample_id_base,db_id_base,gold_complexity_base,structural_score_base,semantic_score_base,f1_score_base,exec_result_base
326,103,103,student_assessment,60,10,0.3391,0.5255,0.412207,0,5808,workshop_paper,4,1.0000,1.0000,1.0000,1
327,65,65,student_assessment,60,4,0.6667,0.6667,0.666700,0,7247,flight_2,7,0.3904,0.3882,0.3892,1
328,89,89,student_assessment,60,9,0.3067,0.7112,0.428579,0,7322,cre_Doc_Template_Mgt,8,0.3317,0.5843,0.4025,1
329,81,81,student_assessment,60,10,0.3878,0.4971,0.435700,0,6314,e_government,7,1.0000,1.0000,1.0000,1
330,107,107,student_assessment,60,9,0.4850,0.5279,0.505542,0,5834,workshop_paper,8,1.0000,1.0000,1.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18207,857,857,chinook_1,868,8,0.4444,0.4445,0.444500,0,3300,college_1,11,1.0000,1.0000,1.0000,1
18208,312,312,product_catalog,309,9,0.8000,0.8412,0.820083,1,5908,cre_Theme_park,7,0.4975,0.5943,0.5380,0
18209,312,312,product_catalog,329,9,0.8000,0.8412,0.820083,0,5908,cre_Theme_park,7,0.4975,0.5943,0.5380,0
18210,312,312,product_catalog,324,9,0.8000,0.8412,0.820083,0,5908,cre_Theme_park,7,0.4975,0.5943,0.5380,0


In [155]:
df_base.columns

Index(['sample_id', 'db_id', 'gold_complexity', 'structural_score',
       'semantic_score', 'f1_score', 'exec_result'],
      dtype='object')

In [157]:
df_base.groupby(['db_id'])['structural_score'].mean()

db_id
activity_1           0.682253
aircraft             0.665956
allergy_1            0.756495
apartment_rentals    0.793388
architecture         0.739333
                       ...   
wine_1               0.810444
workshop_paper       0.806467
world_1              0.716850
wrestler             0.786637
wta_1                0.892736
Name: structural_score, Length: 158, dtype: float64

In [160]:
df_bo.groupby(['retrieved', 'db_id'])['structural_score'].mean()

retrieved  db_id      
16         farm           0.862625
18         farm           0.623800
19         farm           0.623800
20         farm           0.862625
21         farm           0.623800
                            ...   
7978       dog_kennels    0.542933
7981       dog_kennels    0.679733
7984       dog_kennels    0.679733
7986       dog_kennels    0.542933
7988       dog_kennels    0.542933
Name: structural_score, Length: 3209, dtype: float64

In [149]:
df.groupby(['db_id', 'retrieved'])['structural_score'].mean()

db_id       retrieved
activity_1  6711         0.665518
            6713         0.665518
            6714         0.538511
            6716         0.538511
            6717         0.546622
                           ...   
wta_1       7472         0.272850
            7475         0.272850
            7476         0.272850
            7477         0.698900
            7480         0.272850
Name: structural_score, Length: 3209, dtype: float64

In [147]:
df.columns

Index(['sample_id', 'db_id', 'retrieved', 'gold_complexity',
       'structural_score', 'semantic_score', 'f1_score', 'exec_result'],
      dtype='object')

In [5]:
paths = sorted(prediction_path.glob(f'{ds}_{typ}_*.json'))
print(f'Found {len(paths)} files')

# Parse the gold SQL once and cache the result as a pickle under eval_path;
# later runs read the pickle instead of re-parsing.
file_name = f'{ds}_{typ}_parsed.pkl'
samples = load_samples_spider_bird(proj_path / 'data' / f'{ds}_{typ}.json')
target_cache = eval_path / file_name
if not target_cache.exists():
    target_parsed, error_ids = get_target_parsed_sql(samples, tables)
    with target_cache.open('wb') as f:
        pickle.dump(target_parsed, f)
    print(f'Error parsing target {ds}_{typ}: {len(error_ids)}')

with target_cache.open('rb') as f:
    target_parsed = pickle.load(f)

Found 83 files


In [6]:
# Parse the predictions of a prediction file and cache them as a pickle.
# NOTE: the trailing `break` stops after the first file in `paths`; the cells
# below read the `preds`, `db_id` and `pred_parsed` values left by that pass.
for p in paths:
    with p.open() as f:
        preds = json.load(f)
    # NOTE(review): `output_results` is not used anywhere in this cell.
    output_results = []
    # Database id = file stem after its first two '_'-separated fields, with
    # anything from the first '-' onwards removed.
    db_id = p.stem.split('_', 2)[-1].split('-')[0]
    file_name = f'{p.stem}_parsed_pred.pkl'
    # Parse only when no cached pickle exists for this prediction file.
    if not (eval_path / file_name).exists():
        pred_parsed, _ = get_prediction_parsed_sql(preds, tables, patch_db_id=db_id)
        with open(eval_path / file_name, 'wb') as f:
            pickle.dump(pred_parsed, f)
        # print(f'Error parsing pred {args.ds}_{args.type}: {len(error_ids)}')

    # Always read back from the pickle so both paths yield the same object.
    with (eval_path / file_name).open('rb') as f:
        pred_parsed = pickle.load(f)
    break

In [132]:
preds[0]

{'sample_id': 2528,
 'gold_sql': 'SELECT SUM(T1.Installs), T2.Translated_Review FROM playstore AS T1 INNER JOIN user_reviews AS T2 ON T1.App = T2.App WHERE T1."Content Rating" = \'Adults only 18+\'',
 'retrieved': 2538,
 'rationale': ["Identify the relevant tables: 'playstore' for app details and 'user_reviews' for user reviews.",
  "Determine the filtering criteria: content rating of 'Adults only 18+'.",
  'Calculate the total installs for apps that meet the content rating criteria.',
  "Join the 'playstore' table with the 'user_reviews' table on the app name to access translated reviews.",
  'Use SUM() to calculate total installs and select translated reviews from the joined tables.',
  'Group the results by translated review to get a summary of installs per review.'],
 'pred_sql': "SELECT SUM(playstore.installs) AS total_installs, user_reviews.translated_review FROM playstore INNER JOIN user_reviews ON playstore.app = user_reviews.app WHERE playstore.content_rating = 'Adults only 18

In [135]:
db_id_retrieved = [(db_id, x['retrieved']) for x in preds]
len(db_id_retrieved)

60

In [111]:
# Line up gold and predicted parses in the order of `preds`, then score them.
sample_ids_in_file = [x['sample_id'] for x in preds]
target_parsed_outputs = [target_parsed[db_id][sample_id] for sample_id in sample_ids_in_file]
pred_parsed_outputs = [pred_parsed[db_id][sample_id] for sample_id in sample_ids_in_file]

structural_scores = get_all_structural_score(pred_parsed_outputs, target_parsed_outputs)
semantic_scores = get_all_semantic_score(pred_parsed_outputs, target_parsed_outputs)

In [124]:
# F1 = harmonic mean of the structural and semantic scores; `epsilon` keeps
# the denominator non-zero when both scores are 0.
epsilon = 1e-9
structural_scores = np.array(structural_scores)
semantic_scores = np.array(semantic_scores)
f1_numerator = 2 * (structural_scores * semantic_scores)
f1_denominator = structural_scores + semantic_scores + epsilon
f1_scores = f1_numerator / f1_denominator

In [10]:
def stringify(source_asts, target_asts):
    """Render two AST sequences and pair them element by element.

    Each AST is converted with ``str``. When the module-level flag
    ``use_bert`` is falsy the text is additionally passed through
    ``NLP_SPACY``; otherwise the plain string is kept.

    Returns a list of ``(source, target)`` tuples. Pairing uses ``zip``, so
    the result is as long as the shorter input and surplus items are dropped.
    """
    def _render(ast):
        text = str(ast)
        if use_bert:
            return text
        return NLP_SPACY(text)

    rendered_sources = list(map(_render, source_asts))
    rendered_targets = list(map(_render, target_asts))
    return [pair for pair in zip(rendered_sources, rendered_targets)]

# Parsed-SQL components that are compared between prediction and gold.
args = ['table_asts', 'sel_asts', 'cond_asts', 'agg_asts', 'orderby_asts', 'subqueries', 'distinct', 'limit']
use_bert = True
rescale_with_baseline = True
criteria = 'tsed'
penalty = 0.01

# all_pairs   : flat list of (source, target) strings over every sample/component
# all_idxes   : sample index -> component -> positions of its pairs in all_pairs
# all_results : sample index -> component -> score, where
#               -1   = both sides present, similarity still to be computed
#               None = component absent on both sides (nothing to compare)
all_pairs = []
all_idxes = defaultdict(dict)
all_results = defaultdict(dict)
for k, (source_output, target_output) in enumerate(zip(pred_parsed_outputs, target_parsed_outputs)):
    for arg in args:
        is_subquery = arg == 'subqueries'
        # For subqueries the first entry is skipped on both sides before comparing.
        source_items = source_output[arg][1:] if is_subquery else source_output[arg]
        target_items = target_output[arg][1:] if is_subquery else target_output[arg]
        source_exists, target_exists = bool(source_items), bool(target_items)

        if source_exists and target_exists:
            if arg in ('distinct', 'limit'):
                # Flags present on both sides count as a perfect match.
                semantic_score = 1.0 if criteria == 'tsed' else 0.0
            else:
                if not is_subquery:
                    # These components hold triples; only the middle AST is compared.
                    source_items = [ast for _, ast, _ in source_items]
                    target_items = [ast for _, ast, _ in target_items]
                pairs = stringify(source_items, target_items)
                start = len(all_pairs)
                all_pairs.extend(pairs)
                all_idxes[k][arg] = list(range(start, start + len(pairs)))
                semantic_score = -1
        elif source_exists != target_exists:
            # Present on one side only: worst score for the chosen criteria.
            # `np.inf` replaces the `np.infty` alias, which NumPy 2.0 removed.
            semantic_score = 0.0 if criteria == 'tsed' else np.inf
        else:
            # Absent on both sides, so the score cannot be measured.
            semantic_score = None
        all_results[k][arg] = semantic_score

In [11]:
all_pairs[:5]

[('playstore', 'playstore'),
 ('user_reviews', 'user_reviews'),
 ('SUM(playstore.installs)', 'SUM(playstore.installs)'),
 ('user_reviews.translated_review', 'user_reviews.translated_review'),
 ("playstore.content_rating = '[placeholder-type:string]'",
  'playstore."content rating" = \'[placeholder-type:string]\'')]

In [12]:
all_idxes[0]

{'table_asts': [0, 1], 'sel_asts': [2, 3], 'cond_asts': [4]}

In [19]:
from itertools import product

In [13]:
source_str, target_str = zip(*all_pairs)

In [98]:
# For every (sample, component) expand its source/target strings into the full
# cross product and remember where each expanded block ends.
source_str_list, target_str_list = [], []
sparse_idxes = []
idx2arg = defaultdict()
block_end = 0
for k, key_idxes in all_idxes.items():
    for arg, idxes in key_idxes.items():
        s, e = idxes[0], idxes[-1]+1
        xs = list(product(source_str[s:e], target_str[s:e]))
        block_end += len(xs)
        sparse_idxes.append(block_end)
        idx2arg[block_end] = (k, arg)
        source_str_list.extend(left for left, _ in xs)
        target_str_list.extend(right for _, right in xs)

In [97]:
idx2arg[4]

(0, 'table_asts')

In [99]:
from itertools import product, pairwise

# Score every (sample, component) block in one pass.
# Produces:
#   scores       : flat array, one similarity per source x target combination
#   sparse_idxes : cumulative end offset of each block inside `scores`
#   idx2arg      : end offset -> (sample index, component name)
source_str, target_str = zip(*all_pairs) if all_pairs else ((), ())
n, m = len(source_str), len(target_str)

# The cross-product bookkeeping is built for BOTH backends. Previously the
# spaCy branch skipped it, although the matching step that follows needs
# `sparse_idxes` and `idx2arg`.
source_str_list = []
target_str_list = []
sparse_idxes = []
idx2arg = {}
for k, key_idxes in all_idxes.items():
    for arg, idxes in key_idxes.items():
        s, e = idxes[0], idxes[-1]+1
        xs = list(product(source_str[s:e], target_str[s:e]))
        idx = len(xs) + (sparse_idxes[-1] if sparse_idxes else 0)
        sparse_idxes.append(idx)
        idx2arg[idx] = (k, arg)
        source_str_list.extend(src for src, _ in xs)
        target_str_list.extend(tgt for _, tgt in xs)

if not source_str_list:
    # Nothing to compare: keep `scores` a valid, empty array.
    scores = np.array([], dtype=np.float32)
elif use_bert:
    # NOTE(review): device is hard-coded to 'cuda'; this fails on CPU-only machines.
    with warnings.catch_warnings(action='ignore'):
        *_, F1 = bscore(source_str_list, target_str_list, lang='en', verbose=False, rescale_with_baseline=rescale_with_baseline, device='cuda')
    scores = F1.numpy()
else:
    # The lists are already expanded, so score them pairwise. The former
    # nested loop over both expanded lists produced n^4 values per block.
    scores = np.array(
        [src.similarity(tgt) for src, tgt in zip(source_str_list, target_str_list)],
        dtype=np.float32,
    )

In [103]:
scores[i:j]

array([1.0000007 , 0.04659584, 0.04659584, 1.0000014 ], dtype=float32)

In [105]:
# Walk the consecutive [i, j) blocks of `scores`, one per (sample, component).
for i, j in pairwise([0] + sparse_idxes):
    # Each block is the cross product of equally long source/target slices
    # (`stringify` pairs them with zip), so its length is a perfect square.
    n = int(np.sqrt(j - i))
    matrix = scores[i:j].reshape(n, n)
    # Blocks are registered under their END offset.
    k, arg = idx2arg[j]
    # Only the last value returned by the matcher is kept as the component score.
    *_, final_score = partial_matching_with_penalty(matrix, penalty, maximize=True)
    # Overwrites the -1 placeholder stored while the pairs were collected.
    all_results[k][arg] = final_score

In [107]:
len(all_results)

60

In [89]:
source_output = pred_parsed_outputs[0]
target_output = target_parsed_outputs[0]

In [90]:
# Single-sample variant of the pair collection: gathers the comparable
# (source, target) strings of `source_output` / `target_output`.
#   -1   = both sides present, similarity still to be computed
#   None = component absent on both sides (nothing to compare)
all_pairs = []
all_idxes = {}
results = {}
for arg in args:
    is_subquery = arg == 'subqueries'
    # For subqueries the first entry is skipped on both sides before comparing.
    source_items = source_output[arg][1:] if is_subquery else source_output[arg]
    target_items = target_output[arg][1:] if is_subquery else target_output[arg]
    source_exists, target_exists = bool(source_items), bool(target_items)

    if source_exists and target_exists:
        if arg in ('distinct', 'limit'):
            # Flags present on both sides count as a perfect match.
            semantic_score = 1.0 if criteria == 'tsed' else 0.0
        else:
            if not is_subquery:
                # These components hold triples; only the middle AST is compared.
                source_items = [ast for _, ast, _ in source_items]
                target_items = [ast for _, ast, _ in target_items]
            # Both sides are non-empty here, so no extra emptiness guard is needed.
            pairs = stringify(source_items, target_items)
            start = len(all_pairs)
            all_pairs.extend(pairs)
            all_idxes[arg] = list(range(start, start + len(pairs)))
            semantic_score = -1
    elif source_exists != target_exists:
        # Present on one side only: worst score for the chosen criteria.
        # `np.inf` replaces the `np.infty` alias, which NumPy 2.0 removed.
        semantic_score = 0.0 if criteria == 'tsed' else np.inf
    else:
        # Absent on both sides, so the score cannot be measured.
        semantic_score = None
    results[arg] = semantic_score

In [97]:
from itertools import product
# Build the n x m similarity matrix between all collected source and target strings.
source_str, target_str = zip(*all_pairs)
n, m = len(source_str), len(target_str)
if not use_bert:
    # spaCy path: fill the matrix cell by cell with Doc.similarity.
    matrix = np.zeros((n, m), dtype=np.float32)
    for (i, s), (j, t) in product(enumerate(source_str), enumerate(target_str)):
        matrix[i, j] = s.similarity(t)
else:
    # BERTScore path: score the whole cross product in one call, then reshape
    # the flat F1 vector back to (n, m).
    cross_pairs = list(product(source_str, target_str))
    source_str_list = tuple(src for src, _ in cross_pairs)
    target_str_list = tuple(tgt for _, tgt in cross_pairs)
    with warnings.catch_warnings(action='ignore'):
        *_, F1 = bscore(source_str_list, target_str_list, lang='en', verbose=False, rescale_with_baseline=rescale_with_baseline, device='cuda')
    matrix = F1.numpy().reshape(n, m)

In [94]:
penalty = 0.01

*_, final_score = partial_matching_with_penalty(matrix, penalty, maximize=True)

In [95]:
final_score

0.947024

In [3]:
# spider_path = proj_path / 'data' / 'spider'
# tables, train_data, dev_data = load_raw_data(spider_path, load_test=False)

# with (proj_path / 'data' / 'description.json').open() as f:
#     all_descriptions = json.load(f)
# seed = 42
# all_data = filter_samples_by_count_spider_bird(train_data+dev_data, n=10)

# with open(proj_path / 'data' / 'bird_skip.txt') as f:
#     skip = [int(line.strip()) for line in f]

# bird_samples = process_samples_bird(all_data, bird_tables, skip=skip)
# train_samples, dev_samples, test_samples = split_train_dev_test(bird_samples, train_ratio=0.6, dev_ratio=0.2, seed=seed)

# save_samples_spider_bird(train_samples, proj_path / 'data' / 'bird_train.json')
# save_samples_spider_bird(dev_samples, proj_path / 'data' / 'bird_dev.json')
# save_samples_spider_bird(test_samples, proj_path / 'data' / 'bird_test.json')
# print(len(train_samples), len(dev_samples), len(test_samples))

In [4]:
# experiment_folder = proj_path / 'experiments' / 'bird'
# prediction_path = experiment_folder / 'predictions' / 'create_bo'
# tables = bird_tables
# bos = []
# for p in prediction_path.glob('bird_train_bo_*.json'):
#     with p.open() as f:
#         bos = json.load(f)

#     db_id = p.stem.split('_', 3)[-1]
#     schema = Schema(tables[db_id].db_schema)
#     for bo in bos:
#         output = extract_all(bo['gold_sql'], schema)
#         bo['gold_complexity'] = get_complexity(output)
    
#     with p.open('w') as f:
#         json.dump(bos, f, indent=4)

# bos = {}
# for p in prediction_path.glob('bird_train_bo_*.json'):
#     db_id = p.stem.split('_', 3)[-1]
#     with p.open() as f:
#         bos[db_id] = json.load(f)

# with (experiment_folder / 'predictions' / 'create_bo' / f'final_bird_train_bo.json').open('w') as f:
#     json.dump(bos, f, indent=4)

In [14]:
from run_bo_sql import Sampler, get_vector_store, get_retriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
# Retrieval settings used when building retrievers below.
k_retrieval: int = 5  # for test
n_retrieval: int = 1   # for test
score_threshold: float = 0.65
use_reranker: bool = True
# TODO: run spider 4567
ds = 'bird'
task = 'zero_shot'
experiment_folder = proj_path / 'experiments' / ds
prediction_path = experiment_folder / 'predictions' / task
eval_path = experiment_folder / 'evals' / task

bo_path = experiment_folder / 'predictions' / 'create_bo' / f'final_{ds}_train_bo.json'
with bo_path.open() as f:
    bos = json.load(f)

# Keep only the dev samples whose zero-shot prediction failed execution
# (exec_result == 0 in the zero-shot evaluation CSV).
samples = load_samples_spider_bird(proj_path / 'data' / f'{ds}_dev.json')
# The file name follows `ds` instead of a hard-coded 'bird', like the other paths.
df = pd.read_csv(experiment_folder / 'evals' / 'zero_shot' / f'{ds}_dev.csv')
df_score = df.loc[:, ['sample_id', 'db_id', 'exec_result']]
df_error = df_score.loc[df_score['exec_result'] == 0, ['db_id', 'sample_id']]
error_ids = df_error['sample_id'].tolist()
# A set gives O(1) membership tests; `error_ids` stays a list for other cells.
error_id_set = set(error_ids)
samples = [sample for sample in samples if sample.sample_id in error_id_set]


samples_by_db_id = defaultdict(list)
for sample in samples:
    samples_by_db_id[sample.db_id].append(sample)

cross_encoder = HuggingFaceCrossEncoder(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')

vectorstore = get_vector_store(bos)
# dev_samples = load_samples_spider_bird(proj_path / 'data' / f'{ds}_dev.json')
# pred_res = defaultdict(dict)  # db_id -> train_bo -> list[dict]
# for p in prediction_path.glob(f'{ds}_dev_*.json'):
#     name = p.stem.split('_', 2)[-1]
#     db_id, idx = name.split('-')
#     with p.open() as f:
# 
#     for r in res:
#         train_bo_id = r['retrieved']
#         if not pred_res[db_id].get(train_bo_id):
#             pred_res[db_id][train_bo_id] = []
#         pred_res[db_id][r['retrieved']].append(r)
#     break

# save_path = prediction_path / f'final_{ds}_dev.jsonl'

In [15]:
# Ids to exclude from retrieval; empty here, so nothing is excluded.
sample_ids = []

# Build a threshold-based retriever restricted to one database. The `break`
# means only the first database in `samples_by_db_id` is used.
# NOTE(review): the loop variable `samples` overwrites the global `samples`
# list with that first database's samples - confirm this is intended.
for db_id, samples in samples_by_db_id.items():
    retriever = vectorstore.as_retriever(
        search_type='similarity_score_threshold',
        search_kwargs={
            'k': k_retrieval, 
            'score_threshold': score_threshold, 
            # Same database only, skipping any sample_id listed in `sample_ids`.
            'filter': {'db_id': db_id, 'sample_id': {'$nin' : sample_ids}},
        }
    )
    break

In [16]:
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7ff2c8db3e90>, search_type='similarity_score_threshold', search_kwargs={'k': 5, 'score_threshold': 0.65, 'filter': {'db_id': 'movie_platform', 'sample_id': {'$nin': []}}})

In [11]:
df = pd.read_csv(eval_path / f'bird_dev.csv')
df_score = df.loc[:, ['sample_id', 'db_id', 'exec_result']]
df_score['exec_result'].sum() / len(df_score)

0.3385365853658537

In [91]:
# `product` is no longer called in this cell; the import stays because another
# cell in this notebook calls `product` without importing it itself.
from itertools import product
n_sample = 3
n_stop = 50
typ = 'dev'
samples = load_samples_spider_bird(proj_path / 'data' / f'{ds}_{typ}.json')
df = pd.read_csv(experiment_folder / 'evals' / 'zero_shot' / f'{ds}_dev.csv')
df_score = df.loc[:, ['sample_id', 'db_id', 'exec_result']]
df_error = df_score.loc[df_score['exec_result'] == 0, ['db_id', 'sample_id']]
error_ids = df_error['sample_id'].tolist()
# Keep only the samples whose zero-shot `exec_result` was 0. A set gives O(1)
# membership checks instead of scanning the id list once per sample.
error_id_set = set(error_ids)
samples = [s for s in samples if s.sample_id in error_id_set]

with open(experiment_folder / f'partial_{ds}_db_ids.json') as f:
    partial_db_ids = json.load(f)

bo_path = experiment_folder / 'predictions' / 'create_bo' / f'final_{ds}_train_bo.json'
assert bo_path.exists(), 'Run with the `task=create_bo, type=train` first'
with bo_path.open() as f:
    bos = json.load(f)

# with open(experiment_folder / f'partial_{ds}_db_ids.json', 'w') as f:
#     json.dump(partial_db_ids, f, indent=4)

sampler = Sampler(bos)

# For every database group, record each sampled batch of train BOs together
# with the amount of work (BO x sample pairs) that batch implies.
sampled_bos = {}
for db_id_group in partial_db_ids:
    group_key = str(db_id_group)
    sampled_bos[group_key] = defaultdict()
    for db_id in partial_db_ids[group_key]:
        x_samples = [s for s in samples if s.db_id == db_id]
        for idx_bos, train_bos in enumerate(sampler.sample(db_id, n_sample, n_stop, rt_idx=False)):
            sampled_bos[group_key][f'{db_id}-{idx_bos}'] = {
                'train_bos': train_bos,
                # Size of the cartesian product, computed without materialising it.
                'n_iter': len(train_bos) * len(x_samples),
                'total_bos_in_batch': len(train_bos),
                'total_samples_in_batch': len(x_samples)
            }

In [5]:
# Paths and identifiers for the zero-shot experiment on BIRD.
ds = 'bird'
task = 'zero_shot'
typ = 'dev'
experiment_folder = proj_path / 'experiments' / ds
prediction_path = experiment_folder / 'predictions' / task
eval_path = experiment_folder / 'evals' / task

# Load the split, then narrow it to the samples whose `exec_result` is 0.
samples = load_samples_spider_bird(proj_path / 'data' / f'{ds}_{typ}.json')
df = pd.read_csv(experiment_folder / 'evals' / 'zero_shot' / f'{ds}_dev.csv')
df_error = df.loc[df['exec_result'] == 0]
error_ids = df_error['sample_id'].tolist()
samples = [s for s in samples if s.sample_id in error_ids]

# Batch plan previously written to disk (group id -> file name -> batch info).
batch_file = experiment_folder / f'partial_{ds}_batch.json'
with batch_file.open('r') as f:
    partial_batch = json.load(f)

In [8]:
# Total number of batch entries summed over every database group.
count_files = sum(len(batch) for batch in partial_batch.values())
count_files

194

In [172]:
# Cap the number of train BOs kept per database at `to_be`, walking the batch
# files of each group in order and truncating the batch that crosses the cap.
new_partial_batch = defaultdict()
to_be = 30
for db_id_group, batches in partial_batch.items():
    new_batch = defaultdict(dict)
    db_id_count = defaultdict(int)
    for file_name, batch in batches.items():
        db_id, idx = file_name.split('-')
        remaining = to_be - db_id_count[db_id]
        if remaining <= 0:
            # This database already reached the cap: drop the whole file.
            continue

        x_samples = [s for s in samples if s.db_id == db_id]
        # Slicing covers both cases: a batch that fits is kept whole, and the
        # batch that crosses the cap is cut down to the remaining budget.
        train_bos = batch['train_bos'][:remaining]
        new_batch[file_name] = {
            'train_bos': train_bos,
            # Size of the cartesian product, computed without materialising it.
            'n_iter': len(train_bos) * len(x_samples),
            'total_bos_in_batch': len(train_bos),
            'total_samples_in_batch': len(x_samples)
        }
        db_id_count[db_id] += len(train_bos)

    new_partial_batch[db_id_group] = new_batch

    print(f'[{db_id_group}] before: file = {len(batches)} bos=' , sum(len(v['train_bos']) for v in batches.values()), 'n_iter=', sum(v['n_iter'] for v in batches.values()))
    print(f'[{db_id_group}] after: file = {len(new_batch)} bos=' , sum(len(v['train_bos']) for v in new_batch.values()), 'n_iter=', sum(v['n_iter'] for v in new_batch.values()))
    print(f'[{db_id_group}] count = {db_id_count}')
# with (experiment_folder / f'partial_{ds}_batch.json').open('w') as f:
#     json.dump(new_partial_batch, f, indent=4)

[0] before: file = 82 bos= 644 n_iter= 8394
[0] after: file = 59 bos= 511 n_iter= 1957
[1] before: file = 50 bos= 411 n_iter= 4583
[1] after: file = 45 bos= 367 n_iter= 881
[2] before: file = 62 bos= 517 n_iter= 6707
[2] after: file = 46 bos= 396 n_iter= 1184
[3] before: file = 75 bos= 690 n_iter= 10838
[3] after: file = 52 bos= 505 n_iter= 1900
[4] before: file = 68 bos= 561 n_iter= 6246
[4] after: file = 53 bos= 470 n_iter= 1264
[5] before: file = 64 bos= 508 n_iter= 6218
[5] after: file = 50 bos= 409 n_iter= 1073
[6] before: file = 70 bos= 573 n_iter= 7298
[6] after: file = 52 bos= 436 n_iter= 1577
[7] before: file = 70 bos= 583 n_iter= 8072
[7] after: file = 50 bos= 434 n_iter= 1442


In [113]:
# Reload the batch plan from the `-back` copy of the batch file, replacing `partial_batch`.
with (experiment_folder / f'partial_{ds}_batch-back.json').open('r') as f:
    partial_batch = json.load(f)

In [191]:
# Post-process the prediction files that an earlier run already produced.
# WARNING: this cell is destructive - it deletes prediction files that are not
# part of the current batch plan and rewrites the remaining ones in place.
ds = 'bird'
task = 'valid_bo'
experiment_folder = proj_path / 'experiments' / ds
prediction_path = experiment_folder / 'predictions' / task
eval_path = experiment_folder / 'evals' / task

with (experiment_folder / f'partial_{ds}_batch.json').open('r') as f:
    new_partial_batch = json.load(f)

df = pd.read_csv(experiment_folder / 'evals' / 'zero_shot' / f'{ds}_dev.csv')
df_error = df.loc[df['exec_result'] == 0]
error_ids = df_error['sample_id'].tolist()
error_id_set = set(error_ids)  # O(1) membership for the per-record filter below
count_db_ids = defaultdict(int)
for p in sorted(prediction_path.glob(f'{ds}_dev_*.json')):
    # File stems look like '<ds>_dev_<db_id>-<idx>'; `name` is '<db_id>-<idx>'.
    name = p.stem.split('_', 2)[-1]
    db_id, idx = name.split('-')
    with p.open() as f:
        res = json.load(f)
    n_retrieved = set(x['retrieved'] for x in res)

    # Look the file up in the batch plan to learn which train BOs it may use.
    found = False
    for db_id_group, batches in new_partial_batch.items():
        if name in batches:
            found = True
            train_bo_ids = [x['sample_id'] for x in batches[name]['train_bos']]
            break

    if not found:
        p.unlink()
        print(f'{name} not found: {len(n_retrieved)}, delete')
        continue

    count_db_ids[name] += len(n_retrieved)
    print(f'{name} before: n_retrieved = {len(n_retrieved)} | bos = {len(train_bo_ids)} | {len(res)}', end=' ')
    # Keep only the records whose sample is in `error_ids`.
    res = [x for x in res if x['sample_id'] in error_id_set]
    print(f' --> {len(res)}')
    # Keep only the records retrieved from the BOs listed in the batch plan.
    # The previous check (`len(res) < len(filtered)`) could never be true, so
    # this reduction never ran; it now runs whenever something would be dropped.
    train_bo_id_set = set(train_bo_ids)
    kept = [x for x in res if x['retrieved'] in train_bo_id_set]
    if len(kept) < len(res):
        print(f'{name} before reduce bo: {len(res)}')
        res = kept
        print(f'{name} after reduce bo: {len(res)}')

    with p.open('w') as f:
        json.dump(res, f, indent=4)

    # print()

app_store-0 before: n_retrieved = 12 | bos = 12 | 132  --> 60
app_store-1 before: n_retrieved = 12 | bos = 12 | 132  --> 60
app_store-2 before: n_retrieved = 6 | bos = 6 | 66  --> 30
app_store-3 not found: 3, delete
app_store-4 not found: 1, delete
authors-0 before: n_retrieved = 15 | bos = 15 | 510  --> 255
authors-1 before: n_retrieved = 15 | bos = 15 | 510  --> 255
authors-2 not found: 15, delete
authors-3 not found: 5, delete
books-0 before: n_retrieved = 15 | bos = 15 | 585  --> 180
books-1 before: n_retrieved = 15 | bos = 15 | 585  --> 180
books-2 not found: 15, delete
books-3 not found: 5, delete
california_schools-0 before: n_retrieved = 15 | bos = 15 | 255  --> 225
california_schools-1 before: n_retrieved = 15 | bos = 15 | 255  --> 225
california_schools-2 not found: 13, delete
california_schools-3 not found: 6, delete
california_schools-4 not found: 1, delete
college_completion-0 before: n_retrieved = 15 | bos = 15 | 225  --> 225
college_completion-1 before: n_retrieved = 15 

In [171]:
# Split the database ids into consecutive groups of `n` and persist the plan.
db_ids = list(bos.keys())
partial_db_ids = {}
n = 20
# Step through the list by chunk start instead of a fixed `range(30)`, so no
# database is silently dropped when there are more than 30 groups.
for i, start in enumerate(range(0, len(db_ids), n)):
    partial_db_ids[i] = db_ids[start:start + n]
print(partial_db_ids.keys())

with open(experiment_folder / f'partial_{ds}_db_ids.json', 'w') as f:
    json.dump(partial_db_ids, f, indent=4)

dict_keys([0, 1, 2, 3, 4, 5, 6, 7])


In [172]:
# Reload the group plan from disk. After the JSON round trip the group keys are
# strings, which is why later cells index it with `str(db_id_group)`.
with open(experiment_folder / f'partial_{ds}_db_ids.json') as f:
    partial_db_ids = json.load(f)

# NOTE(review): `Sampler` is not defined or imported in the visible part of the notebook.
sampler = Sampler(bos)

In [174]:
from itertools import product, islice

def batched(iterable, n, *, strict=False):
    """Yield successive tuples of at most ``n`` items taken from ``iterable``.

    Example: ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G``.

    Because this is a generator, both errors below surface on iteration and
    not at call time.

    Raises:
        ValueError: if ``n`` is smaller than one, or if ``strict`` is true and
            the final tuple holds fewer than ``n`` items.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    # The empty tuple is the sentinel: it is produced once `source` runs dry.
    for chunk in iter(lambda: tuple(islice(source, n)), ()):
        if strict and len(chunk) != n:
            raise ValueError('batched(): incomplete batch')
        yield chunk

# For every database group, record each sampled batch of train BOs together
# with the number of (train BO, dev sample) pairs it implies, then save the plan.
sampled_ids = {}
for db_id_group in partial_db_ids:
    group_key = str(db_id_group)
    sampled_ids[group_key] = defaultdict()
    for db_id in partial_db_ids[group_key]:
        x_samples = [s for s in dev_samples if s.db_id == db_id]
        for idx_bos, train_bos in enumerate(sampler.sample(db_id, 3, 50, rt_idx=False)):
            sampled_ids[group_key][f'{db_id}-{idx_bos}'] = {
                'train_bos': train_bos,
                # Size of the cartesian product, computed without materialising it.
                'n_iter': len(train_bos) * len(x_samples),
                'total_bos_in_batch': len(train_bos)
            }

with (experiment_folder / f'partial_{ds}_batch.json').open('w') as f:
    json.dump(sampled_ids, f, indent=4)

In [175]:
# Per group: number of batch files, total iterations and mean iterations per file.
for db_id_group in partial_db_ids:
    group_batches = sampled_ids[str(db_id_group)]
    print(len(group_batches))
    niters = [entry['n_iter'] for entry in group_batches.values()]
    print(f'n_iter: {sum(niters)}, iter per file: {np.mean(niters):.2f}')

82
n_iter: 8394, iter per file: 102.37
50
n_iter: 4583, iter per file: 91.66
62
n_iter: 6707, iter per file: 108.18
75
n_iter: 10838, iter per file: 144.51
68
n_iter: 6246, iter per file: 91.85
64
n_iter: 6218, iter per file: 97.16
70
n_iter: 7298, iter per file: 104.26
70
n_iter: 8072, iter per file: 115.31


In [6]:
# One row per business object: its database id and its gold complexity.
df = pd.DataFrame([
    {'db_id': db_id, 'gold_complexity': b['gold_complexity']}
    for db_id, bs in bos.items()
    for b in bs
])

In [115]:
from itertools import pairwise

def _format_interval(x: pd.Interval):
    return pd.Interval(
        left=int(np.floor(x.left)), 
        right=int(np.floor(x.right)),
        closed=x.closed
    )

def _get_categories(s: pd.Series):
    tiles = [0, 0.2, 0.4, 0.6, 0.8, 1]
    df = pd.qcut(s, q=tiles, duplicates='drop')
    return df

def _get_df_from_bos(bos):
    """Flatten ``bos`` into a frame and attach a per-database complexity bin.

    ``bos`` maps a database id to a list of business-object dicts. Each dict
    becomes one row, prefixed with a ``db_id`` column. ``gold_complexity`` is
    then binned separately for every ``db_id`` via ``_get_categories``, the
    bin bounds are floored by ``_format_interval``, and the result is joined
    back on the row index as a ``category`` column.
    """
    records = [{'db_id': db_id, **b} for db_id, bs in bos.items() for b in bs]
    df = pd.DataFrame(records)
    binned = df.groupby('db_id')['gold_complexity'].apply(_get_categories)
    categories = binned.rename('category').apply(_format_interval)
    # Drop the `db_id` index level added by the groupby so the remaining index
    # lines up with `df`'s row index for the merge.
    categories = categories.reset_index('db_id', drop=True)
    return df.merge(categories, left_index=True, right_index=True)

In [None]:
# Paths and identifiers for the `zero_shot_hint` task on the BIRD dev split.
# NOTE: this rebinds the notebook-level `task`, `prediction_path` and `eval_path`.
ds = 'bird'
task = 'zero_shot_hint'
typ = 'dev'
experiment_folder = proj_path / 'experiments' / ds
prediction_path = experiment_folder / 'predictions' / task
eval_path = experiment_folder / 'evals' / task

# file_name = f'{ds}_{typ}_parsed.pkl'
# with (eval_path / file_name).open('rb') as f:
#     target_parsed = pickle.load(f)

In [308]:
prediction_path.parent.parent

PosixPath('/home/simonjisu/code/BusinessObjects/experiments/bird')

In [304]:
bos['address'][:4]

[{'sample_id': 5156,
  'vt': "SELECT area_code.area_code, country.county FROM area_code INNER JOIN country AS T2 ON T1.zip_code = T2.zip_code INNER JOIN zip_data AS T3 ON T1.zip_code = T3.zip_code WHERE zip_data.city = '[placeholder-type:string]'",
  'ba': "The virtual table provides the area code and county information for a specific city based on its zip code. It combines data from the 'area_code', 'country', and 'zip_data' tables, filtering results to match the specified city name.",
  'gold_complexity': 10,
  'gold_sql': "SELECT T1.area_code, T2.county FROM area_code AS T1 INNER JOIN country AS T2 ON T1.zip_code = T2.zip_code INNER JOIN zip_data AS T3 ON T1.zip_code = T3.zip_code WHERE T3.city = 'Savoy'"},
 {'sample_id': 5211,
  'vt': 'SELECT alias.alias FROM alias INNER JOIN zip_data AS T2 ON T1.zip_code = T2.zip_code WHERE zip_data.population_2020 = (SELECT MAX(zip_data.population_2020) FROM zip_data)',
  'ba': "The virtual table retrieves the aliases of cities from the 'alias' t

In [4]:
from pydantic import BaseModel
from langchain_openai import ChatOpenAI

from langchain_community.callbacks.manager import get_openai_callback

# Schema handed to `with_structured_output` below: a single string field.
# (Documented with `#` comments on purpose — a class docstring may be forwarded
# to the model as the schema description; TODO confirm.)
class Out(BaseModel):
    response: str

# Chat model for a one-off usage check; `stream_usage=True` is presumably what
# makes token usage available to the callback below (confirm against the docs).
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    stream_usage=True,
)
# Constrain replies to the `Out` schema. NOTE: this rebinds the notebook-level `model`.
model = llm.with_structured_output(Out)


# Send one request inside the OpenAI callback context and print what it recorded.
with get_openai_callback() as cb:
    result = model.invoke("Tell me a joke with JSON format")
    print(cb)

Tokens Used: 85
	Prompt Tokens: 51
	Completion Tokens: 34
Successful Requests: 1
Total Cost (USD): $2.805e-05


In [63]:
# Mean / std / min / max of the number of train samples per database.
samples_by_db_id = defaultdict(list)
for sample in train_samples:
    samples_by_db_id[sample.db_id].append(sample)

x = [len(db_samples) for db_samples in samples_by_db_id.values()]

print(np.mean(x), np.std(x), np.min(x), np.max(x))

80.26582278481013 46.229123611557306 11 280


In [64]:
# Mean / std / min / max of the number of dev samples per database.
samples_by_db_id = defaultdict(list)
for sample in dev_samples:
    samples_by_db_id[sample.db_id].append(sample)

x = [len(db_samples) for db_samples in samples_by_db_id.values()]

print(np.mean(x), np.std(x), np.min(x), np.max(x))

26.468354430379748 15.462355628942769 3 93


In [119]:
# with open(proj_path / 'data' / 'pkl_files' / 'bird_train_parsed.pkl', 'rb') as f:
#     train_parsed = pickle.load(f)

# # prediction parsed
# with open(proj_path / 'data' / 'pkl_files' / 'bird_dev_parsed.pkl', 'rb') as f:
#     dev_parsed = pickle.load(f)

In [17]:
# Merge the per-file evaluation records (one JSON object per line) into a
# single frame and save it as `bird_dev.csv` next to them.
eval_path = proj_path / 'experiments' / 'bird' / 'evals' / 'zero_shot'

eval_records = []
for p in eval_path.glob('bird_dev_*.json'):
    with p.open() as f:
        eval_records.extend(json.loads(line) for line in f)

df = pd.DataFrame(eval_records)
df.to_csv(eval_path / 'bird_dev.csv', index=False)

In [16]:
df['gold_complexity'].agg(['mean', 'std', 'min', 'max', 'median'])

mean      0.450361
std       0.055482
min       0.318118
max       0.726155
median    0.446118
Name: gold_complexity, dtype: float64

In [31]:
# Gather the per-database business-object files into one mapping keyed by the
# part of the file stem that follows 'bird_train_bo_'.
prediction_path = proj_path / 'experiments' / 'bird' / 'predictions' / 'create_bo'
bos = defaultdict(list)
for p in prediction_path.glob('bird_train_bo_*.json'):
    with p.open() as f:
        loaded = json.load(f)
    # maxsplit=3 keeps underscores inside the trailing part intact.
    bo_key = p.stem.split('_', 3)[-1]
    bos[bo_key] = loaded

# with (prediction_path / 'final_bird_train_bo.json').open('w') as f:
#     json.dump(bos, f, indent=4)

In [178]:
vector_store = get_vector_store({'address': bos['address'][:10]})

In [179]:
[b['sample_id'] for b in bos['address'][:10]]

[5156, 5211, 5227, 5091, 5152, 5128, 5200, 5119, 5194, 5141]

In [161]:
bos['address'][:10][0]

{'sample_id': 5156,
 'vt': "SELECT area_code.area_code, country.county FROM area_code INNER JOIN country AS T2 ON T1.zip_code = T2.zip_code INNER JOIN zip_data AS T3 ON T1.zip_code = T3.zip_code WHERE zip_data.city = '[placeholder-type:string]'",
 'ba': "The virtual table provides the area code and county information for a specific city based on its zip code. It combines data from the 'area_code', 'country', and 'zip_data' tables, filtering results to match the specified city name."}

In [180]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
# First stage: score-thresholded similarity search returning up to 3 documents.
# The `$nin` filter is given an empty list, so no sample id is excluded.
base_retriever = vector_store.as_retriever(
    search_type='similarity_score_threshold', 
    search_kwargs={
        'k': 3,
        'score_threshold': 0.3, 'filter': {'sample_id': {'$nin': []}}
    }
)

# Other search_kwargs kept for reference:
# 'lambda_mult': 0.5  'score_threshold': 0.0
# 'filter': {'sample_id': {'$in': [5156]}}}
# Second stage: a cross-encoder reranker keeping the single best document.
# NOTE: this rebinds the notebook-level names `model` and `retriever`.
model = HuggingFaceCrossEncoder(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')
compressor = CrossEncoderReranker(model=model, top_n=1)
retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=base_retriever
)

In [181]:
# Probe the thresholded base retriever directly (the reranking retriever is not used here).
q = 'what is the aliases of cities along with their elevation?'
x = base_retriever.invoke(q)
x

No relevant docs were retrieved using the relevance score threshold 0.3


[]

In [182]:
# Query the store for (document, relevance score) pairs, excluding three
# sample ids through the `$nin` metadata filter.
x = vector_store.similarity_search_with_relevance_scores(
    q, k=2, filter={'sample_id': {'$nin': [5152, 5211, 5194]}})
x
# similarity_search_with_relevance_scores

[]

In [148]:
# Keep only the (document, score) pairs from `x` whose score is at least 0.5.
docs_and_similarities = [
    (doc, similarity)
    for doc, similarity in x
    if similarity >= 0.5
]
docs_and_similarities

[(Document(metadata={'sample_id': 5152, 'db_id': 'address', 'vt': 'SELECT alias.alias, zip_data.elevation FROM alias INNER JOIN zip_data AS T2 ON T1.zip_code = T2.zip_code WHERE alias.zip_code = [placeholder-type:numeric]'}, page_content="The virtual table describes the aliases of cities along with their elevation from the 'zip_data' table. The query joins the 'alias' table with the 'zip_data' table based on the zip code, filtering for a specific zip code using a placeholder for numeric values."),
  0.7825041385389271),
 (Document(metadata={'sample_id': 5211, 'db_id': 'address', 'vt': 'SELECT alias.alias FROM alias INNER JOIN zip_data AS T2 ON T1.zip_code = T2.zip_code WHERE zip_data.population_2020 = (SELECT MAX(zip_data.population_2020) FROM zip_data)'}, page_content="The virtual table retrieves the aliases of cities from the 'alias' table that correspond to the zip codes with the highest population recorded in 2020 from the 'zip_data' table. The query uses an inner join to connect t

# Predict BO

In [117]:
def get_vector_store(bos: dict[str, list[dict[str, str]]]):
    """Build a FAISS index over the descriptions of all business objects.

    Note: defining this in the notebook shadows the ``get_vector_store``
    imported from ``run_bo_sql`` in the import cell.

    Args:
        bos: maps a database id to its business objects; each object must
            provide the keys ``'sample_id'``, ``'ba'`` and ``'vt'``.

    Returns:
        A ``FAISS`` store with one document per business object. The ``'ba'``
        text is the page content that gets embedded; ``sample_id``, ``db_id``
        and ``vt`` are stored as metadata.
    """
    # NOTE(review): `doc_id` is forwarded to `Document` unchanged from the
    # previous version; confirm that `Document` actually uses this keyword.
    documents = [
        Document(
            doc_id=bo['sample_id'],
            page_content=bo['ba'],
            metadata={
                'sample_id': bo['sample_id'],
                'db_id': db_id,
                'vt': bo['vt'],
            },
        )
        for db_id, db_bos in bos.items()
        for bo in db_bos
    ]
    return FAISS.from_documents(documents, embedding=OpenAIEmbeddings())

In [None]:
vectorstore = get_vector_store(res)

In [156]:
def predict_sql_bo(
    to_pred_samples: list[SpiderSample|BirdSample],
    tables: dict[str, DatabaseModel],
    vectorstore: FAISS,
    chain: RunnableSequence,
    prediction_path: Path,
    file_name: str = '[args.ds]_[args.type]',
    n_retrieval: int = 3,
    score_threshold: float = 0.65,
):
    """Predict SQL for each sample, using retrieved business objects as hints.

    Samples are processed one database at a time. For every question the
    retriever fetches business objects of the same database, their
    descriptions and virtual tables are serialised into a hint, and ``chain``
    is invoked with the schema, the question and the hint. The predictions of
    one database are written to ``<prediction_path>/<file_name>_<db_id>.json``
    once that database is finished.

    Databases that already have an output file are skipped, so an interrupted
    run can be restarted.

    Args:
        to_pred_samples: samples to predict; each needs ``db_id``,
            ``sample_id`` and ``final.question``.
        tables: maps a database id to its schema model.
        vectorstore: store holding the business objects.
        chain: runnable whose output exposes ``rationale`` and ``full_sql_query``.
        prediction_path: directory for the per-database JSON files.
        file_name: prefix of the output files.
        n_retrieval: ``k`` passed to the retriever.
        score_threshold: ``score_threshold`` passed to the retriever.
    """
    # Restart from checkpoint. The db_id is everything after '<file_name>_' in
    # the stem; splitting on '_' and taking the last piece would cut a db_id
    # such as 'movie_platform' down to 'platform' and never match.
    prefix = f'{file_name}_'
    processed_db_ids = {
        p.stem.removeprefix(prefix) for p in prediction_path.glob(f'{prefix}*')
    }
    if processed_db_ids:
        to_pred_samples = [
            sample for sample in to_pred_samples
            if sample.db_id not in processed_db_ids
        ]

    samples_by_db_id = defaultdict(list)
    for sample in to_pred_samples:
        samples_by_db_id[sample.db_id].append(sample)

    for db_id, samples in samples_by_db_id.items():
        # NOTE(review): no `search_type` is given here, while other cells in
        # this notebook use search_type='similarity_score_threshold'. Confirm
        # that `score_threshold` is interpreted as intended in this mode.
        retriever = vectorstore.as_retriever(
            search_kwargs={'k': n_retrieval, 'score_threshold': score_threshold, 'filter': {'db_id': db_id}}
        )
        schema_str = get_schema_str(
            schema=tables[db_id].db_schema,
            foreign_keys=tables[db_id].foreign_keys,
            col_explanation=tables[db_id].col_explanation
        )
        results = []
        for sample in tqdm(samples, total=len(samples), desc=f"{db_id}"):
            question = sample.final.question
            docs = retriever.invoke(question)
            hint_payload = {
                j: {'description': doc.page_content, 'virtual_table': doc.metadata['vt']}
                for j, doc in enumerate(docs)
            }
            hint = '\nDescriptions and Virtual Tables:\n' + json.dumps(hint_payload, indent=4) + '\n'
            input_data = {'schema': schema_str, 'input_query': question, 'hint': hint}
            output = chain.invoke(input=input_data)

            results.append({
                'sample_id': sample.sample_id,
                'rationale': output.rationale,
                'pred_sql': output.full_sql_query,
            })

        # Written once per database; this file is also the restart checkpoint.
        with open(prediction_path / f'{file_name}_{db_id}.json', 'w') as f:
            json.dump(results, f, indent=4)

In [157]:

# with open(proj_path / 'data' / 'pkl_files' / 'bird_train_bo.json', 'r') as f:
#     bos = json.load(res, f, indent=4)
# vectorstore = get_vector_store(bos)


# Output locations for the zero-shot-with-hints run; created if missing.
data_path = proj_path / 'data' / 'bird'
experiment_folder = proj_path / 'experiments' / 'bird'
prediction_path = experiment_folder / 'predictions' / 'zero_shot_hint'
eval_path = experiment_folder / 'evals'
for p in [prediction_path, eval_path]:
    if not p.exists():
        p.mkdir(parents=True)

# Prompt template with three inputs: schema, question and retrieved hints.
prompt = PromptTemplate(
    template=Prompts.zero_shot_hints_inference,
    input_variables=['schema', 'input_query', 'hint'],
)

model_openai = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0.0,
    frequency_penalty=0.1,
)

# Constrain the output to `SQLResponse` and pipe the prompt into the model.
model = model_openai.with_structured_output(SQLResponse)
chain = (prompt | model)

n_retrieval = 3
score_threshold = 0.65

# Run on the first 10 dev samples only.
# NOTE: `vectorstore` must already exist in the kernel; it is not built in this cell.
predict_sql_bo(
    to_pred_samples=dev_samples[:10],
    tables=bird_tables,
    vectorstore=vectorstore,
    chain=chain,
    prediction_path=prediction_path,
    n_retrieval=n_retrieval,
    score_threshold=score_threshold,
    file_name='bird_dev',
)

movie_platform: 100%|██████████| 10/10 [00:04<00:00,  2.34it/s]


In [138]:
# Rebuild the hint for a single sample by hand and print it for inspection.
# NOTE(review): `sample`, `db_schema`, `row`, `retriever` and `chain` are not
# defined in this cell; it depends on variables left in the kernel by other cells.
docs = retriever.invoke(sample.final.question)
hint = '\nDescriptions and Virtual Tables:\n'
hint += json.dumps({j: {'description': doc.page_content, 'virtual_table': doc.metadata['vt']} for j, doc in enumerate(docs)}, indent=4)
hint += '\n'
input_data = {'schema': db_schema, 'input_query': row['question'], 'hint': hint}
output = chain.invoke(input=input_data)

print(hint)


Descriptions and Virtual Tables:
{
    "0": {
        "description": "The virtual table retrieves the titles of movies that have been rated, filtering by a specific rating timestamp and grouping the results by movie title. The results are ordered by the count of ratings for each movie title, and a limit is applied to restrict the number of returned titles.",
        "virtual_table": "SELECT movies.movie_title FROM ratings INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE ratings.rating_timestamp_utc LIKE '[placeholder-type:string]' GROUP BY movies.movie_title ORDER BY COUNT(movies.movie_title) LIMIT [placeholder-type:numeric]"
    },
    "1": {
        "description": "The virtual table provides a count of users who have rated a specific movie, identified by its title, while also filtering for users who were trialists at the time of rating. It combines data from the 'ratings' and 'movies' tables to achieve this.",
        "virtual_table": "SELECT COUNT(ratings.user_id) FROM ra

# Similarity between datasets

In [125]:
def get_parsed_sql(samples: list, tables: dict):
    """Parse the SQL of every sample and group the results by database.

    Args:
        samples: iterable of samples exposing ``db_id``, ``sample_id`` and
            ``final.sql``.
        tables: maps a database id to an object with a ``db_schema`` attribute.

    Returns:
        A ``(parsed, error_ids)`` tuple. ``parsed[db_id][sample_id]`` holds the
        result of ``extract_all`` for that sample, or ``None`` when parsing
        failed or produced no selection. ``error_ids`` lists one
        ``(db_id, sample_id, message)`` tuple per failure.
    """
    error_ids = []
    parsed = defaultdict(dict)
    iterator = tqdm(samples, total=len(samples))
    for sample in iterator:
        db_id = sample.db_id
        sample_id = sample.sample_id
        iterator.set_description(f"{db_id}")
        schema = Schema(tables[db_id].db_schema)
        sql_i = sample.final.sql
        try:
            ei = extract_all(sql_i, schema)
            assert len(ei['sel']) > 0, f'No selection found-{db_id}-{sample_id}'
        except Exception as e:
            error_ids.append((db_id, sample_id, str(e)))
            # `parsed[db_id]` is a dict, so the old `.append(None)` raised
            # AttributeError inside this handler. Store the placeholder under
            # the sample id, like successful parses.
            parsed[db_id][sample_id] = None
            continue
        parsed[db_id][sample_id] = ei
    return parsed, error_ids

# Parse every split. NOTE: each call rebinds `error_ids`, so after this cell
# only the failures of the test split remain in that variable.
train_parsed, error_ids = get_parsed_sql(train_samples, bird_tables)
dev_parsed, error_ids = get_parsed_sql(dev_samples, bird_tables)
test_parsed, error_ids = get_parsed_sql(test_samples, bird_tables)

movie_platform:   0%|          | 0/6341 [00:00<?, ?it/s]

debit_card_specializing: 100%|██████████| 6341/6341 [00:21<00:00, 299.00it/s]    
debit_card_specializing: 100%|██████████| 2091/2091 [00:05<00:00, 408.98it/s]    
debit_card_specializing: 100%|██████████| 2193/2193 [00:09<00:00, 225.64it/s]    


In [129]:
# with open(proj_path / 'data' / 'pkl_files' / 'bird_train_parsed.pkl', 'wb') as f:
#     pickle.dump(train_parsed, f)

# with open(proj_path / 'data' / 'pkl_files' / 'bird_dev_parsed.pkl', 'wb') as f:
#     pickle.dump(dev_parsed, f)

# with open(proj_path / 'data' / 'pkl_files' / 'bird_test_parsed.pkl', 'wb') as f:
#     pickle.dump(test_parsed, f)

# Reload the cached parse results for the dev and test splits.
# NOTE: `pickle.load` executes code embedded in the file; only load pickles
# produced by this project.
with open(proj_path / 'data' / 'pkl_files' / 'bird_dev_parsed.pkl', 'rb') as f:
    dev_parsed = pickle.load(f)

with open(proj_path / 'data' / 'pkl_files' / 'bird_test_parsed.pkl', 'rb') as f:
    test_parsed = pickle.load(f)

In [None]:
from itertools import combinations, product
from collections import defaultdict
from src.eval_utils import get_all_partial_score

def measure_inter_score(parsed1: dict[str, tuple], parsed2: dict[str, tuple]):
    """Score every pair of parsed queries between two datasets, per database.

    For each database id in ``parsed1``, every entry of ``parsed1[db_id]`` is
    compared with every entry of ``parsed2[db_id]`` through
    ``get_all_partial_score(..., use_bert=True)``.

    Entries are addressed by position (``0 .. len - 1``).
    NOTE(review): this assumes the per-database containers are indexable by
    integer position — confirm against how the inputs were built.

    Returns:
        A mapping ``db_id -> {'semantic', 'struct', 'overall'}`` whose values
        are float32 arrays of shape ``(len(parsed1[db_id]), len(parsed2[db_id]))``.

    Raises:
        AssertionError: if the two inputs hold a different number of databases.
    """
    results = defaultdict()
    assert len(parsed1) == len(parsed2), f"Length mismatch-1: {len(parsed1)} 2:{len(parsed2)}"
    for db_id in list(parsed1.keys()):
        o1, o2 = parsed1[db_id], parsed2[db_id]
        n1, n2 = len(o1), len(o2)
        shape = (n1, n2)
        semantic_sim = np.zeros(shape, dtype=np.float32)
        structural_sim = np.zeros(shape, dtype=np.float32)
        overall_sim = np.zeros(shape, dtype=np.float32)

        pairs = list(product(range(n1), range(n2)))
        for i, j in tqdm(pairs, total=len(pairs), desc=f"{db_id}"):
            _, final_score = get_all_partial_score(o1[i], o2[j], use_bert=True)

            structural_sim[i, j] = final_score['structural']
            semantic_sim[i, j] = final_score['semantic']
            overall_sim[i, j] = final_score['overall']

        results[db_id] = {
            'semantic': semantic_sim,
            'struct': structural_sim,
            'overall': overall_sim,
        }
    return results

# Score every dev/test pair per database and cache the resulting matrices on disk.
results = measure_inter_score(dev_parsed, test_parsed)
with (proj_path / 'data' / 'pkl_files' / 'bird_dev_test_similarity.pkl').open('wb') as f:
    pickle.dump(results, f)

# Complexity between datasets

In [12]:
def measure_complexity(samples, tables):
    """Return the complexity of each sample's SQL, in input order.

    Every sample's ``final.sql`` is parsed with ``extract_all`` against the
    schema of its database (``tables[sample.db_id].db_schema``) and the parse
    result is passed to ``get_complexity``.
    """
    complexities = []
    progress = tqdm(samples, total=len(samples))
    for sample in progress:
        db_schema = Schema(tables[sample.db_id].db_schema)
        parsed = extract_all(sample.final.sql, db_schema)
        complexities.append(get_complexity(parsed))
    return complexities

# Complexity of every gold SQL query, computed separately for each split.
train_complexities = measure_complexity(train_samples, bird_tables)
dev_complexities = measure_complexity(dev_samples, bird_tables)
test_complexities = measure_complexity(test_samples, bird_tables)

100%|██████████| 6341/6341 [00:10<00:00, 631.17it/s]
100%|██████████| 2091/2091 [00:03<00:00, 589.55it/s]
100%|██████████| 2193/2193 [00:03<00:00, 591.64it/s]


In [22]:
# Print mean, standard deviation and median of the complexities for each split.
# The labels are padded with spaces so the printed prefixes line up.
for c, n in zip([train_complexities, dev_complexities, test_complexities], ['train', 'dev  ', 'test ']):
    print(f'[{n}] Mean={np.mean(c):.4f} +/-{np.std(c):.4f}, Median={np.median(c):.4f}')

[train] Mean=0.2753 +/-0.0476, Median=0.2710
[dev  ] Mean=0.2758 +/-0.0471, Median=0.2710
[test ] Mean=0.2760 +/-0.0477, Median=0.2709


In [None]:
stats = defaultdict(list)
for s in dev_samples:
    stats[s.db_id].append(s)