In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
proj_path = Path('.').resolve()
sys.path.append(str(proj_path))

import sqlglot
import numpy as np
from sqlglot import expressions as exp
from src.parsing_sql import Schema, extract_all
from src.eval_utils import (
    partial_match, 
    compute_tsed
)

from src.parsing_sql import (
    extract_aliases,
    extract_condition,
    get_subqueries,
    _extract_conditions,
    _extract_columns_from_expression,
    _determine_tag,
    _format_expression,
    _get_full_column_name,
    extract_aliases,
    extract_selection,
    extract_aggregation,
    extract_orderby,
    extract_others,
    
    _extract_aliases_from_select,
    _handle_table_or_subquery
)

In [3]:
# Table -> {column -> declared type} mapping for the tables referenced by the
# example queries below.
# NOTE(review): every column inside a table carries the same type (e.g.
# `movie_title` is 'integer', `list_followers` is 'text'); this looks like an
# artefact of how the schema was dumped — confirm before relying on the types.
schema_dict = {'lists': {'user_id': 'text',
  'list_id': 'text',
  'list_title': 'text',
  'list_movie_number': 'text',
  'list_update_timestamp_utc': 'text',
  'list_creation_timestamp_utc': 'text',
  'list_followers': 'text',
  'list_url': 'text',
  'list_comments': 'text',
  'list_description': 'text',
  'list_cover_image_url': 'text',
  'list_first_image_url': 'text',
  'list_second_image_url': 'text',
  'list_third_image_url': 'text'},
 'movies': {'movie_id': 'integer',
  'movie_title': 'integer',
  'movie_release_year': 'integer',
  'movie_url': 'integer',
  'movie_title_language': 'integer',
  'movie_popularity': 'integer',
  'movie_image_url': 'integer',
  'director_id': 'integer',
  'director_name': 'integer',
  'director_url': 'integer'},
 'ratings_users': {'user_id': 'integer',
  'rating_date_utc': 'integer',
  'user_trialist': 'integer',
  'user_subscriber': 'integer',
  'user_avatar_image_url': 'integer',
  'user_cover_image_url': 'integer',
  'user_eligible_for_trial': 'integer',
  'user_has_payment_method': 'integer'},
 'lists_users': {'user_id': 'text',
  'list_id': 'text',
  'list_update_date_utc': 'text',
  'list_creation_date_utc': 'text',
  'user_trialist': 'text',
  'user_subscriber': 'text',
  'user_avatar_image_url': 'text',
  'user_cover_image_url': 'text',
  'user_eligible_for_trial': 'text',
  'user_has_payment_method': 'text'},
 'ratings': {'movie_id': 'integer',
  'rating_id': 'integer',
  'rating_url': 'integer',
  'rating_score': 'integer',
  'rating_timestamp_utc': 'integer',
  'critic': 'integer',
  'critic_likes': 'integer',
  'critic_comments': 'integer',
  'user_id': 'integer',
  'user_trialist': 'integer',
  'user_subscriber': 'integer',
  'user_eligible_for_trial': 'integer',
  'user_has_payment_method': 'integer'}}

# Example queries, one per line (the block is split on newlines further down,
# so a query must never span more than one line here).
sqls = """
SELECT movie_release_year FROM movies WHERE movie_title = 'Cops'
SELECT T1.user_id FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE rating_score = 4 AND rating_timestamp_utc LIKE '2013-05-04 06:33:32' AND T2.movie_title LIKE 'Freaks'
SELECT T1.user_trialist FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE T2.movie_title = 'A Way of Life' AND T1.user_id = 39115684
SELECT T2.movie_title FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE T1.rating_timestamp_utc LIKE '2020%' GROUP BY T2.movie_title ORDER BY COUNT(T2.movie_title) DESC LIMIT 1
SELECT AVG(T1.rating_score), T2.director_name FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE T2.movie_title = 'When Will I Be Loved'
"""
# Wrap the raw mapping in the project's `Schema` helper (passed to `extract_all`).
schema = Schema(schema_dict)
# Rebind `sqls` from the multi-line string to a list with one stripped query per line.
sqls = [s.strip() for s in sqls.strip().split('\n')]

In [5]:
# For every example query, print each clause family reported by `extract_all`.
# Optional sections are printed only when the corresponding entry is non-empty.
for sql in sqls:
    output = extract_all(sql, schema)
    print('SQL:', sql)

    # SELECT list: entries are read as [1] = AST and [2] = type tag.
    print('# Selection')
    print(f'  unique columns: {output["sel"]}')
    for i, ast in enumerate(output['sel_asts']):
        print(f' [{i}] type: {ast[2]}')
        print(f' [{i}] ast:')
        print('  ' + repr(ast[1]))

    # The remaining AST lists are read as [0] = formatted text and [1] = AST.
    if output['cond_asts']:
        print('\n# condition')
        print(f'  operations: {output["op_types"]}')
        for i, ast in enumerate(output['cond_asts']):
            print(f' [{i}] {ast[0]}')
            print(f' [{i}] ast:')
            print('  ' + repr(ast[1]))
    if output['agg_asts']:
        print('\n# aggregation')
        print(f'  unique columns: {output["agg"]}')
        for i, ast in enumerate(output['agg_asts']):
            print(f' [{i}] {ast[0]}')
            print(f' [{i}] ast:')
            print('  ' + repr(ast[1]))
    if output['orderby_asts']:
        print('\n# orderby')
        print(f'  unique columns: {output["orderby"]}')
        # Fixed: this section used to iterate `output['group_asts']`, so the
        # ASTs listed under "# orderby" were not the ORDER BY ones guarded above.
        for i, ast in enumerate(output['orderby_asts']):
            print(f' [{i}] {ast[0]}')
            print(f' [{i}] ast:')
            print('  ' + repr(ast[1]))

    if output['nested']:
        print('\n# nested')
        print(f'  number of nested: {output["nested"]}')
        # Check `output['subqueries']` if you want to see the nested queries;
        # the first entry is the original query.
    if output['distinct']:
        print(f'\n# distinct: {output["distinct"]}')
    if output['limit']:
        print(f'\n# limit: {output["limit"]}')
    print('----------------------------------')

SQL: SELECT movie_release_year FROM movies WHERE movie_title = 'Cops'
# Selection
  unique columns: {'__movies.movie_release_year__'}
 [0] type: <select>
 [0] ast:
  Column(
  this=Identifier(this=movie_release_year, quoted=False),
  table=Identifier(this=movies, quoted=False))

# condition
  operations: {'eq'}
 [0] __movies.movie_title__ eq [placeholder-type:string]
 [0] ast:
  EQ(
  this=Column(
    this=Identifier(this=movie_title, quoted=False),
    table=Identifier(this=movies, quoted=False)),
  expression=Literal(this=[placeholder-type:string], is_string=True))

# nested
  number of nested: 1
----------------------------------
SQL: SELECT T1.user_id FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE rating_score = 4 AND rating_timestamp_utc LIKE '2013-05-04 06:33:32' AND T2.movie_title LIKE 'Freaks'
# Selection
  unique columns: {'__ratings.user_id__'}
 [0] type: <select>
 [0] ast:
  Column(
  this=Identifier(this=user_id, quoted=False),
  table=Identif

# Measurement of Complexity

1. Tree Similarity of Edit Distance (TSED)
2. Sets of unique columns, tables, and function types

* `n` = number of source asts
* `m` = number of target asts

```python
if n == m:
    # the source and the target have the same number of ASTs
elif n > m:
    # the source has more ASTs than the target
else:
    # the source has fewer ASTs than the target


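Source and target ASTs are paired by solving the assignment problem (Hungarian algorithm, see https://hongl.tistory.com/159) with `scipy.optimize.linear_sum_assignment`.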



In [169]:
# import sqlglot

# Query 1: join of ratings and movies filtered by three WHERE predicates.
sql1 = """SELECT T1.USER_ID 
FROM ratings AS T1 
INNER JOIN movies AS T2 
ON T1.movie_id = T2.movie_id 
WHERE 
    rating_score = 4 
    AND rating_timestamp_utc LIKE '2013-05-04 06:33:32' 
    AND T2.movie_title LIKE 'Freaks'
"""

# Query 2: the same join, but aggregated (GROUP BY / HAVING / ORDER BY) and
# without a WHERE clause.
sql2 = """SELECT T1.user_id, COUNT(T2.movie_title)
FROM ratings AS T1 
INNER JOIN movies AS T2 
ON T1.movie_id = T2.movie_id 
GROUP BY T1.user_id
HAVING COUNT(T2.movie_title) > 1
ORDER BY COUNT(T2.movie_title) DESC
"""
# Schema wrapper built from the `schema_dict` mapping; handed to `extract_all`.
schema = Schema(schema_dict)

# sql1 = """SELECT
#   COUNT(*) AS count
# FROM lineitem
# WHERE
#   lineitem.l_commitdate < lineitem.l_receiptdate
#   AND lineitem.l_receiptdate >= '1993-01-01'
#   AND lineitem.l_receiptdate < '1994-01-01'
# """

# sql2 = """SELECT
#   COUNT(*) AS late_line_items_count
# FROM LINEITEM L
# WHERE
#   lineitem.L_RECEIPTDATE > lineitem.L_COMMITDATE
#   AND STRFTIME('%Y', lineitem.L_RECEIPTDATE) = 'abcd'"""

# schema = Schema({
#     'lineitem': {'l_receiptdate': 'date', 'l_commitdate': 'date'}
# })

# Run the extractor on both queries (sql1 first, then sql2).
output1, output2 = (extract_all(query, schema) for query in (sql1, sql2))

# Entry 0 of 'subqueries' is the whole (formatted) query; compare those two trees.
formatted_sql1, formatted_sql2 = output1['subqueries'][0], output2['subqueries'][0]
tsed, distance = compute_tsed(formatted_sql1, formatted_sql2, build_type='apted')  # apted or zss

# Show both formatted queries, each followed by a blank line, then the metrics.
print('[SQL1]\n', formatted_sql1.sql(pretty=True), end='\n\n')
print('[SQL2]\n', formatted_sql2.sql(pretty=True), end='\n\n')
print(f'TSED: {tsed:.4f}')
print(f'Tree Edit Distance: {distance}')


# Header line for the per-clause partial-match scores printed at the end of this cell.
print('Partial Match Score')

from scipy.optimize import linear_sum_assignment 

def partial_matching_with_penalty(matrix: np.ndarray, penalty: float=0.0, maximize: bool=True, epsilon: float=1e-9):
    """Pair sources with targets one-to-one and average the matched values.

    The (n, m) `matrix` is embedded in a square matrix of side max(n, m); the
    extra cells stand for "left unmatched" and are filled from `penalty`.

    Args:
        matrix: 2-D array of pairwise values (rows = sources, columns = targets).
        penalty: non-negative charge for every unmatched row/column.
        maximize: True when `matrix` holds similarities (larger is better),
            False when it holds distances/costs (smaller is better).
        epsilon: constant added to every matched value before averaging.

    Returns:
        (row_ind, col_ind, total_score): the assignment over the *padded*
        square matrix (indices >= n or >= m denote padding) and the mean of the
        matched values.
    """
    n, m = matrix.shape  # (# of source, # of target)
    size = max(n, m)
    # The padding has to count *against* the result in both modes: a negative
    # score when maximising, a positive cost when minimising. (Previously it
    # was always -penalty, which rewarded unmatched items in distance mode.)
    fill_value = -penalty if maximize else penalty
    score_matrix = np.full((size, size), fill_value, dtype=np.float32)
    score_matrix[:n, :m] = matrix
    row_ind, col_ind = linear_sum_assignment(score_matrix, maximize=maximize)
    total_score = (score_matrix[row_ind, col_ind] + epsilon).mean()
    return row_ind, col_ind, total_score

def get_score(
        source: list[exp.Expression], 
        target: list[exp.Expression], 
        build_type: str='apted',
        criteria: str='tsed',
        penalty: float=0.01,
    ):
    """Score how well the `source` ASTs can be paired with the `target` ASTs.

    Every (source, target) pair is evaluated with `compute_tsed`, which yields
    a (score, distance) pair. The matrix selected by `criteria` is then handed
    to `partial_matching_with_penalty` together with `penalty`; the same
    `penalty` is used whether or not the two lists have equal length.

    Args:
        source: list of source ASTs (n items).
        target: list of target ASTs (m items).
        build_type: forwarded unchanged to `compute_tsed`.
        criteria: 'tsed' uses the score matrix and maximises; any other value
            uses the distance matrix and minimises.
        penalty: forwarded to `partial_matching_with_penalty`.

    Returns:
        The aggregated score returned by `partial_matching_with_penalty`.
    """
    n_source, n_target = len(source), len(target)
    scores = np.zeros((n_source, n_target), dtype=np.float32)
    distance = np.zeros((n_source, n_target), dtype=np.float32)
    # Row-major walk over the full source x target grid.
    for i, j in np.ndindex(n_source, n_target):
        scores[i, j], distance[i, j] = compute_tsed(source[i], target[j], build_type)

    use_similarity = criteria == 'tsed'
    pairwise = scores if use_similarity else distance
    _, _, final_score = partial_matching_with_penalty(pairwise, penalty, use_similarity)
    return final_score

def get_partial_score(
        output1, 
        output2, 
        arg,
        build_type: str='apted',
        criteria: str='tsed',
        penalty: float=0.01,
    ):
    """Compare one component (`arg`) of two extraction results.

    Outcome by presence of the component:

    output1 |  output2  |  score
    present |  present  |  `get_score(...)` for AST lists / nested queries,
            |           |  perfect score for the boolean-like components
    present |  absent   |  None (cannot be measured)
    absent  |  present  |  None (cannot be measured)
    absent  |  absent   |  perfect score

    The perfect score is 1.0 when `criteria == 'tsed'` and 0.0 when
    `criteria == 'distance'`.

    arg:
     - 'sel_asts', 'cond_asts', 'agg_asts', 'orderby_asts': every entry is a
       3-item tuple whose middle item is the AST that gets compared
     - 'subqueries': entry 0 (the original query) is skipped; only the
       remaining nested queries are compared
     - 'distinct', 'limit': only presence is compared

    Args:
        output1, output2: mappings indexed by `arg`.
        build_type: 'apted' or 'zss'; forwarded to `get_score`.
        criteria: 'tsed' or 'distance'.
        penalty: forwarded to `get_score`.

    Returns:
        A float score, or None when the component exists on one side only.
    """
    assert build_type in ['apted', 'zss'], f'build_type should be either apted or zss, but got {build_type}'
    assert criteria in ['tsed', 'distance'], f'criteria should be either tsed or distance, but got {criteria}'
    assert arg in ['sel_asts', 'cond_asts', 'agg_asts', 'orderby_asts', 'subqueries', 'distinct', 'limit'], f'arg should be either sel_asts, cond_asts, agg_asts, orderby_asts, subqueries, distinct, limit, but got {arg}'

    perfect = 1.0 if criteria == 'tsed' else 0.0

    if arg == 'subqueries':
        # Entry 0 is the original query itself; only what follows is nested.
        items1, items2 = output1[arg][1:], output2[arg][1:]
    else:
        items1, items2 = output1[arg], output2[arg]

    if items1 and items2:
        if arg in ['distinct', 'limit']:
            return perfect
        if arg == 'subqueries':
            # Fixed: the previous implementation recursed on ever-shorter
            # slices, so nested queries were never actually scored and
            # `build_type` was dropped along the way.
            # Assumes each nested entry is an AST accepted by `get_score` — TODO confirm.
            source, target = list(items1), list(items2)
        else:
            source = [ast for _, ast, _ in items1]
            target = [ast for _, ast, _ in items2]
        return get_score(source, target, build_type, criteria, penalty)

    if (not items1) and (not items2):
        return perfect

    # The component exists on one side only, so no score can be measured.
    return None

# Settings shared by every partial-score call below.
build_type = 'apted'  # apted or zss
criteria = 'tsed'  # tsed or distance
penalty = 0.01

# Fixed: `build_type` used to be configured here but never passed on, so
# switching it to 'zss' silently had no effect on the partial scores.
score_kwargs = dict(build_type=build_type, criteria=criteria, penalty=penalty)

sel_score = get_partial_score(output1, output2, arg='sel_asts', **score_kwargs)
print(f'  Selection: {criteria}={sel_score}')
cond_score = get_partial_score(output1, output2, arg='cond_asts', **score_kwargs)
print(f'  Condition: {criteria}={cond_score}')
agg_score = get_partial_score(output1, output2, arg='agg_asts', **score_kwargs)
print(f'  Aggregation: {criteria}={agg_score}')
orderby_score = get_partial_score(output1, output2, arg='orderby_asts', **score_kwargs)
print(f'  Orderby: {criteria}={orderby_score}')
nested_score = get_partial_score(output1, output2, arg='subqueries', **score_kwargs)
print(f'  Nested: {criteria}={nested_score}')
distinct_score = get_partial_score(output1, output2, arg='distinct', **score_kwargs)
print(f'  Distinct: {criteria}={distinct_score}')
limit_score = get_partial_score(output1, output2, arg='limit', **score_kwargs)
print(f'  Limit: {criteria}={limit_score}')

[SQL1]
 SELECT
  ratings.user_id
FROM ratings
INNER JOIN movies AS T2
  ON T1.movie_id = T2.movie_id
WHERE
  ratings.rating_score = [placeholder-type:numeric]
  AND ratings.rating_timestamp_utc LIKE '[placeholder-type:string]'
  AND movies.movie_title LIKE '[placeholder-type:string]'

[SQL2]
 SELECT
  ratings.user_id,
  COUNT(movies.movie_title)
FROM ratings
INNER JOIN movies AS T2
  ON T1.movie_id = T2.movie_id
GROUP BY
  ratings.user_id
HAVING
  COUNT(movies.movie_title) > [placeholder-type:numeric]
ORDER BY
  COUNT(movies.movie_title)

TSED: 0.4359
Tree Edit Distance: 22
Partial Match Score
  Selection: tsed=0.4950000047683716
  Condition: tsed=0.1600000113248825
  Aggregation: tsed=None
  Orderby: tsed=None
  Nested: tsed=1.0
  Distinct: tsed=1.0
  Limit: tsed=1.0


In [168]:
sel_score

0.495

In [48]:
import spacy
# Load the medium English spaCy model, downloading it first when it is missing.
try:
    nlp_spacy = spacy.load('en_core_web_md')
except OSError:
    from spacy.cli import download
    download('en_core_web_md')
    # Fixed: the model was downloaded but never loaded, which left `nlp_spacy`
    # undefined on the first run of a fresh environment.
    nlp_spacy = spacy.load('en_core_web_md')

from bert_score import score as bscore

In [49]:
# Run the spaCy pipeline on the string form of every source / target item.
# NOTE(review): `source` and `target` are not assigned at notebook level in the
# visible cells — this cell depends on leftover kernel state.
source_spacy = list(map(nlp_spacy, map(str, source)))
target_spacy = list(map(nlp_spacy, map(str, target)))

# One line per (source, target) pair: similarity, then both documents.
for s in source_spacy:
    for t in target_spacy:
        print('{:.5f}'.format(s.similarity(t)), s, t)

1.00000 lineitem.l_commitdate < lineitem.l_receiptdate lineitem.l_receiptdate > lineitem.l_commitdate
0.96241 lineitem.l_commitdate < lineitem.l_receiptdate STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
0.98193 lineitem.l_receiptdate >= '[placeholder-type:string]' lineitem.l_receiptdate > lineitem.l_commitdate
0.99266 lineitem.l_receiptdate >= '[placeholder-type:string]' STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
0.97697 lineitem.l_receiptdate < '[placeholder-type:string]' lineitem.l_receiptdate > lineitem.l_commitdate
0.99397 lineitem.l_receiptdate < '[placeholder-type:string]' STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'


In [50]:
from itertools import product
# String form of every source / target item.
source_str = list(map(str, source))
target_str = list(map(str, target))

# Flatten the full source x target grid into two aligned sequences, because
# `bscore` is called with candidate/reference sequences of equal length.
source_str_list, target_str_list = zip(*product(source_str, target_str))
P, R, F1 = bscore(source_str_list, target_str_list, lang='en', verbose=False)

# Fold the flat results back into (n_source, n_target) matrices.
precision, recall, f1 = (
    flat.numpy().reshape(len(source_str), len(target_str))
    for flat in (P, R, F1)
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
f1

array([[0.9841494 , 0.868934  ],
       [0.8904283 , 0.93327105],
       [0.8900186 , 0.9319242 ]], dtype=float32)

In [None]:
from scipy.optimize import linear_sum_assignment

In [69]:
# For every source, print each target with its BERTScore F1 and its TSED score,
# plus the rank each metric gives that target within the source's row.
# NOTE(review): `score` is indexed as a 2-D array here but is not assigned in
# any visible cell — this relies on leftover kernel state; confirm its origin.
for i, s in enumerate(source_str):
    print(f'Source{i}: {s}')
    # argsort()[::-1] lists the column indices from highest to lowest value;
    # argsort() of that permutation gives each column's rank (0 = highest).
    rank_f1 = f1[i, :].argsort()[::-1].argsort()
    rank_score = score[i, :].argsort()[::-1].argsort()
    for j, t in enumerate(target_str):
        print(f'  -> Target{j}: {t}')
        print(f'  BS: {f1[i, j]:.4f} ({rank_f1[j]}) | TSED: {score[i, j]:.4f} ({rank_score[j]})')

Source0: lineitem.l_commitdate < lineitem.l_receiptdate
  -> Target0: lineitem.l_receiptdate > lineitem.l_commitdate
  BS: 0.9841 (0) | TSED: 0.2857 (0)
  -> Target1: STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
  BS: 0.8689 (1) | TSED: 0.1429 (1)
Source1: lineitem.l_receiptdate >= '[placeholder-type:string]'
  -> Target0: lineitem.l_receiptdate > lineitem.l_commitdate
  BS: 0.8904 (1) | TSED: 0.4286 (1)
  -> Target1: STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
  BS: 0.9333 (0) | TSED: 0.5714 (0)
Source2: lineitem.l_receiptdate < '[placeholder-type:string]'
  -> Target0: lineitem.l_receiptdate > lineitem.l_commitdate
  BS: 0.8900 (1) | TSED: 0.4286 (1)
  -> Target1: STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
  BS: 0.9319 (0) | TSED: 0.5714 (0)


In [145]:
distance

array([[5., 6.],
       [4., 3.],
       [4., 3.]], dtype=float32)

In [56]:
f1

array([[0.9841494 , 0.868934  ],
       [0.8904283 , 0.93327105],
       [0.8900186 , 0.9319242 ]], dtype=float32)

In [128]:
from scipy.optimize import linear_sum_assignment 

def partial_matching_with_penalty(matrix: np.ndarray, penalty: float=0.0, is_sim: bool=True):
    """Pair sources with targets one-to-one and average the matched values.

    The (n, m) `matrix` is embedded in a square matrix of side max(n, m); the
    extra cells stand for "left unmatched" and are filled from `penalty`.

    Args:
        matrix: 2-D array of pairwise values (rows = sources, columns = targets).
        penalty: non-negative charge for every unmatched row/column.
        is_sim: True when `matrix` holds similarities (the assignment is
            maximised), False when it holds distances (it is minimised).

    Returns:
        (row_ind, col_ind, total_score): the assignment over the *padded*
        square matrix (indices >= n or >= m denote padding) and the mean of the
        matched values.
    """
    n, m = matrix.shape  # (# of source, # of target)
    size = max(n, m)
    # The padding has to count *against* the result in both modes: a negative
    # score for similarities, a positive cost for distances. (Previously it
    # was always -penalty, which rewarded unmatched items when is_sim=False.)
    fill_value = -penalty if is_sim else penalty
    score_matrix = np.full((size, size), fill_value, dtype=np.float32)
    score_matrix[:n, :m] = matrix
    row_ind, col_ind = linear_sum_assignment(score_matrix, maximize=is_sim)
    total_score = score_matrix[row_ind, col_ind].mean()
    return row_ind, col_ind, total_score

In [146]:
row_ind, col_ind, total_score = partial_matching_with_penalty(distance, penalty=0.0)
total_score

3.3333333

In [130]:
row_ind, col_ind

(array([0, 1, 2]), array([2, 1, 0]))

In [131]:
row_ind, col_ind, total_f1 = partial_matching_with_penalty(f1)
total_f1

0.6391401

In [110]:
score

array([[0.28571429, 0.14285714],
       [0.42857143, 0.57142857],
       [0.42857143, 0.57142857]])

In [None]:
row_ind, col_ind = linear_sum_assignment(score, maximize=True)
score[row_ind, col_ind]

array([0.42857143, 0.57142857])

In [124]:
f1

array([[0.9841494 , 0.868934  ],
       [0.8904283 , 0.93327105],
       [0.8900186 , 0.9319242 ]], dtype=float32)

In [123]:
row_ind, col_ind = linear_sum_assignment(f1, maximize=True)
f1[row_ind, col_ind]

array([0.9841494 , 0.93327105], dtype=float32)

In [150]:
# score = np.array(
#     [[0.1, 0.5, 0.3], 
#      [0.1, 0.7, 0.2]])

# score = np.array(
#     [[0.1, 0.5], 
#      [0.1, 0.7]])

row_ind, col_ind = linear_sum_assignment(distance, maximize=False)
distance[row_ind, col_ind], distance[row_ind, col_ind].mean()

(array([4., 3.], dtype=float32), 3.5)

In [151]:
distance

array([[5., 6.],
       [4., 3.],
       [4., 3.]], dtype=float32)

In [140]:
# Step-by-step version of the padded assignment, run directly on `score`
# with a zero fill value for the padding cells.
matrix = score
n, m = matrix.shape  # (# of source, # of target)
size = max(n, m)
# Square matrix of side max(n, m); cells outside the original n x m block stay at the fill value.
score_matrix = np.full((size, size), -0.0, dtype=np.float32)
score_matrix[:n, :m] = matrix
# One-to-one assignment that maximises the summed value.
row_ind, col_ind = linear_sum_assignment(score_matrix, maximize=True)
# Mean over all matched cells, padding cells included.
total_score = score_matrix[row_ind, col_ind].mean()
total_score

0.4

In [141]:
score_matrix[row_ind, col_ind]

array([0.1, 0.7], dtype=float32)

In [136]:
partial_matching_with_penalty(score, penalty=0.0)

(array([0, 1, 2]), array([2, 1, 0]), 0.33333334)

```
Source0: lineitem.l_commitdate < lineitem.l_receiptdate
  -> Target0: lineitem.l_receiptdate > lineitem.l_commitdate
  BS: 0.9841 (0) | TSED: 0.2857 (0)
  -> Target1: STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
  BS: 0.8689 (1) | TSED: 0.1429 (1)
Source1: lineitem.l_receiptdate >= '[placeholder-type:string]'
  -> Target0: lineitem.l_receiptdate > lineitem.l_commitdate
  BS: 0.8904 (1) | TSED: 0.4286 (1)
  -> Target1: STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
  BS: 0.9333 (0) | TSED: 0.5714 (0)
Source2: lineitem.l_receiptdate < '[placeholder-type:string]'
  -> Target0: lineitem.l_receiptdate > lineitem.l_commitdate
  BS: 0.8900 (1) | TSED: 0.4286 (1)
  -> Target1: STRFTIME('%Y', lineitem.l_receiptdate) = '[placeholder-type:string]'
  BS: 0.9319 (0) | TSED: 0.5714 (0)
```

In [141]:
exp.Identifier(
    this=expr.args['this'].name.lower(), 
    quoted=expr.args['this'].quoted   
)

Identifier(this=lineitem, quoted=False)