In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local `src` package) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path

import sqlparse
from src.database import SqliteDatabase
from src.eval import result_eq, check_if_exists_orderby
from src.eval_complexity import eval_all
from src.process_sql import get_schema, Schema
from src.parsing_sql import (
    extract_selection, 
    extract_condition, 
    extract_aggregation, 
    extract_nested_setoperation, 
    extract_others,
    extract_aliases,
)
# Absolute path of the current working directory; the data and experiment paths
# in this notebook are built from it.
# NOTE(review): assumes the kernel is started from the project root — confirm.
proj_path = Path('.').resolve()


# Complexity

In [7]:
# Read the in-domain train/test splits and the predictions of experiment 1,
# then attach the predictions to the test rows via their shared `sample_id`.
df_train = pd.read_csv(proj_path.joinpath('data', 'split_in_domain', 'spider_bo_desc_train.csv'))
df_test = pd.read_csv(proj_path.joinpath('data', 'split_in_domain', 'test.csv'))
df_pred = pd.read_csv(proj_path.joinpath('experiments', 'bo_evals', 'test_exp1.csv'))
df_test = df_test.merge(df_pred, on='sample_id')

In [8]:
def get_error_infos(df_test):
    """Find test samples whose predicted SQL cannot be parsed into components.

    Every prediction column of every row is run through ``sqlparse`` and the
    ``extract_*`` helpers. The first column of a row that raises is recorded
    and the remaining columns of that row are skipped, so each sample
    contributes at most one entry to ``'parsing_sql'``.

    Args:
        df_test: DataFrame with ``sample_id``, ``db_id`` and one SQL column per
            entry of ``test_cols`` below.

    Returns:
        dict with the keys
            ``'parsing_sql'``: list of ``(sample_id, column, error message)``,
            ``'error_samples'``: set of sample ids with a parsing failure,
            ``'pred_exec'`` / ``'result'``: lists that this function never
            fills; kept so the returned structure is unchanged.
    """
    iterator = tqdm(df_test.iterrows(), total=len(df_test))
    error_infos = {
        'pred_exec': [],
        'result': [],
        'parsing_sql': [],
        'error_samples': set(),
    }

    test_cols = ['c_low', 'c_mid', 'c_high', 't_1',  't_2',  't_3+']
    # The schema only depends on the database file, so build it once per
    # database instead of once per row.
    schema_by_db = {}
    for _, x in iterator:
        db_file = str(proj_path / 'data' / 'spider' / 'database' / x['db_id'] / f'{x["db_id"]}.sqlite')
        if db_file not in schema_by_db:
            schema_by_db[db_file] = Schema(get_schema(db_file))
        schema = schema_by_db[db_file]

        for test_col in test_cols:
            try:
                # A non-string cell (e.g. NaN) fails on .strip() and is
                # reported like any other parsing error.
                statement = sqlparse.parse(x[test_col].strip())[0]
                aliases = extract_aliases(statement)
                # Only the absence of an exception matters here; the parsed
                # components themselves are discarded.
                extract_selection(statement, aliases, schema)
                extract_condition(statement, aliases, schema)
                extract_aggregation(statement, aliases, schema)
                extract_nested_setoperation(statement)
                extract_others(statement, aliases, schema)
            except Exception as e:
                error_infos['parsing_sql'].append((x['sample_id'], test_col, str(e)))
                error_infos['error_samples'].add(x['sample_id'])
                break

        # Refresh the counter on every row, including rows that just failed,
        # so the progress bar never lags behind the real error count.
        iterator.set_description_str(f'error samples {len(error_infos["error_samples"])}')

    print(f'Parsing SQL errors: {len(error_infos["parsing_sql"])}')
    return error_infos

# Collect the samples with unparseable predictions; the evaluation cell below
# excludes every id listed in error_infos['error_samples'].
error_infos = get_error_infos(df_test)

error samples 33: 100%|██████████| 2018/2018 [00:17<00:00, 117.71it/s]

Parsing SQL errors: 33





In [9]:
# Evaluate every prediction column against the gold SQL:
#   * partial / complexity scores via eval_all on the parsed SQL components,
#   * an execution score by running both queries on the SQLite database.
# One CSV per prediction column is written, and the scores are merged into df_test.
test_cols = ['c_low', 'c_mid', 'c_high', 't_1',  't_2',  't_3+']
eval_cols = ['score', 's_sel', 's_cond', 's_agg', 's_nest', 's_oth']
# Output-column suffix -> key inside eval_all(...)['score'].
partial_score_keys = {
    's_sel': 'selection',
    's_cond': 'condition',
    's_agg': 'aggregation',
    's_nest': 'nested',
    's_oth': 'others',
}


def _parse_sql_components(sql, schema, prefix):
    """Parse ``sql`` and return its components keyed ``{prefix}_<component>`` for eval_all."""
    statement = sqlparse.parse(sql.strip())[0]
    aliases = extract_aliases(statement)
    selection = extract_selection(statement, aliases, schema)
    condition = extract_condition(statement, aliases, schema)
    aggregation = extract_aggregation(statement, aliases, schema)
    nested = extract_nested_setoperation(statement)
    others = extract_others(statement, aliases, schema)
    return {
        f'{prefix}_selection': selection,
        f'{prefix}_condition': condition,
        f'{prefix}_aggregation': aggregation,
        f'{prefix}_nested': nested,
        f'{prefix}_others': {
            'distinct': others['distinct'],
            'order by': others['order by'],
            'limit': others['limit'],
        },
    }


# Only samples whose predictions all parsed (see get_error_infos) are evaluated.
df = df_test.loc[~df_test['sample_id'].isin(error_infos['error_samples'])].reset_index(drop=True)
# The schema only depends on the database file: build it once per database.
schema_by_db = {}
for test_col in test_cols:
    df_exp = df.loc[:, ['sample_id', 'db_id', 'gold_sql', test_col]]
    iterator = tqdm(df_exp.iterrows(), total=len(df_exp), desc=f'Processing {test_col}')
    # init task eval results
    task_results = {'sample_id': []}
    for col in eval_cols:
        task_results[f'{test_col}_{col}'] = []

    for _, x in iterator:
        db_file = str(proj_path / 'data' / 'spider' / 'database' / x['db_id'] / f'{x["db_id"]}.sqlite')
        if db_file not in schema_by_db:
            schema_by_db[db_file] = Schema(get_schema(db_file))
        schema = schema_by_db[db_file]

        # partial & complexity eval
        parsed_result = {
            **_parse_sql_components(x['gold_sql'], schema, 'gold'),
            **_parse_sql_components(x[test_col], schema, 'pred'),
        }
        eval_res = eval_all(parsed_result, k=6)

        # execution eval (prediction first, then gold, as before)
        # NOTE(review): a new SqliteDatabase is created per row; whether it
        # needs to be closed explicitly is not visible from this notebook.
        database = SqliteDatabase(db_file)
        pred_failed = False
        gold_failed = False
        try:
            pred_result = database.execute(x[test_col], rt_pandas=False)
        except Exception:
            pred_failed = True
        try:
            gold_result = database.execute(x['gold_sql'], rt_pandas=False)
        except Exception:
            gold_failed = True

        if gold_failed:
            # Without a gold result no execution score can be computed. Record
            # a missing value instead of skipping the append: skipping left the
            # score list shorter than the other lists, which makes
            # pd.DataFrame(task_results) fail.
            score = None
        elif pred_failed:
            score = 0
        else:
            exists_orderby = check_if_exists_orderby(x['gold_sql'])
            score = int(result_eq(pred_result, gold_result, order_matters=exists_orderby))

        # Append everything for this row in one place so all lists stay aligned.
        task_results['sample_id'].append(x['sample_id'])
        task_results[f'{test_col}_score'].append(score)
        for suffix, score_key in partial_score_keys.items():
            task_results[f'{test_col}_{suffix}'].append(eval_res['score'][score_key])

    df_temp = pd.DataFrame(task_results)
    # Drop score columns left over from a previous run of this cell, so that
    # re-running it does not create duplicated `_x` / `_y` columns in df_test.
    stale_cols = [c for c in df_temp.columns if c != 'sample_id' and c in df_test.columns]
    df_test = pd.merge(df_test.drop(columns=stale_cols), df_temp, on='sample_id', how='left')
    df_temp.to_csv(proj_path / 'experiments' / 'bo_evals' / f'{test_col}.csv', index=False)

Processing c_low:   0%|          | 0/1985 [00:00<?, ?it/s]

Processing c_low: 100%|██████████| 1985/1985 [00:20<00:00, 98.85it/s] 
Processing c_mid: 100%|██████████| 1985/1985 [00:19<00:00, 100.32it/s]
Processing c_high: 100%|██████████| 1985/1985 [00:18<00:00, 104.50it/s]
Processing t_1: 100%|██████████| 1985/1985 [00:20<00:00, 97.58it/s] 
Processing t_2: 100%|██████████| 1985/1985 [00:22<00:00, 89.15it/s] 
Processing t_3+: 100%|██████████| 1985/1985 [00:24<00:00, 81.25it/s] 


In [10]:
# Persist the test frame, now carrying the per-column evaluation scores, as a single CSV.
df_test.to_csv(proj_path.joinpath('experiments', 'bo_evals', 'all.csv'), index=False)

# Numbers of BO

In [22]:
# Per-database distribution (min, quartiles, max) of `pm_score_rank` in the training split.
df_train = pd.read_csv(proj_path.joinpath('data', 'split_in_domain', 'spider_bo_desc_train.csv'))
rank_summary_cols = ['min', '25%', '50%', '75%', 'max']
df_pm_stats = df_train.groupby(['db_id'])['pm_score_rank'].describe()[rank_summary_cols]
df_pm_stats

Unnamed: 0_level_0,min,25%,50%,75%,max
db_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
activity_1,1.0,6.00,12.0,21.0,21.0
aircraft,1.0,5.00,8.0,14.0,14.0
allergy_1,1.0,8.00,8.0,21.0,21.0
apartment_rentals,1.0,7.00,14.0,14.0,14.0
architecture,1.0,2.00,6.0,10.0,10.0
...,...,...,...,...,...
wine_1,1.0,8.25,9.0,22.0,22.0
workshop_paper,1.0,3.75,5.5,7.0,7.0
world_1,1.0,8.00,14.0,28.0,31.0
wrestler,1.0,2.00,7.0,7.0,7.0


In [None]:
from typing import Optional
from dotenv import load_dotenv, find_dotenv
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.runnables import RunnableSequence
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

# Load variables from the .env file located by find_dotenv() into the environment.
# NOTE(review): presumably this provides the OpenAI API key used by OpenAIEmbeddings — confirm.
_ = load_dotenv(find_dotenv())
# Same expression as the definition in the imports cell, repeated in this cell.
proj_path = Path('.').resolve()

def filter_by_pm_score(x: pd.Series, df_pm_stats: pd.DataFrame, percentile: int):
    """Tell whether a row's ``pm_score_rank`` lies strictly below its database's cut-off.

    Args:
        x: A row providing ``db_id`` and ``pm_score_rank``.
        df_pm_stats: Statistics indexed by ``db_id`` with one column per
            percentile, labelled like ``'25%'``.
        percentile: Percentile whose column is used as the cut-off.

    Returns:
        The result of ``x['pm_score_rank'] < cut-off``.
    """
    stat_column = f'{percentile}%'
    database_id = x['db_id']
    cutoff = df_pm_stats.loc[database_id, stat_column]
    is_below_cutoff = x['pm_score_rank'] < cutoff
    return is_below_cutoff

def get_vector_store(proj_path, percentile: Optional[str]=None):
    """Build a FAISS vector store over the training-split descriptions.

    Args:
        proj_path: Project root; the training CSV is read from
            ``data/split_in_domain/spider_bo_desc_train.csv`` below it.
        percentile: When truthy, only rows for which ``filter_by_pm_score``
            is true for this percentile are kept. Only the ``'25%'``,
            ``'50%'`` and ``'75%'`` statistics are computed here.

    Returns:
        A FAISS store with one document per remaining row, embedded with
        ``OpenAIEmbeddings`` and using the Euclidean distance strategy.
    """
    train_csv = proj_path / 'data' / 'split_in_domain' / 'spider_bo_desc_train.csv'
    df_train = pd.read_csv(train_csv)

    if percentile:
        # Per-database quartiles of pm_score_rank serve as the filter cut-offs.
        rank_stats = (
            df_train.groupby(['db_id'])['pm_score_rank']
            .describe()
            .loc[:, ['25%', '50%', '75%']]
        )
        keep_mask = df_train.apply(filter_by_pm_score, axis=1, args=(rank_stats, percentile))
        df_train = df_train.loc[keep_mask].reset_index(drop=True)

    metadata_fields = ['sample_id', 'db_id', 'cate_gold_c', 'cate_len_tbls', 'virtual_table']
    documents = [
        Document(
            doc_id=row['sample_id'],
            page_content=row['description'],
            metadata={field: row[field] for field in metadata_fields},
        )
        for _, row in df_train.iterrows()
    ]

    return FAISS.from_documents(
        documents,
        embedding=OpenAIEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )