In [1]:
%load_ext autoreload
%autoreload 2

In [75]:
import sys
from pathlib import Path
# Put the current working directory on sys.path so the `src.*` imports below
# resolve. NOTE(review): this assumes the notebook is started from the project
# root (the directory that contains `src/`) — confirm.
proj_path = Path('.').resolve()
sys.path.append(str(proj_path))

import sqlglot
import numpy as np
from sqlglot import expressions as exp
# Project-local helpers: schema wrapper + SQL component extraction, and the
# evaluation metrics.
from src.parsing_sql import Schema, extract_all
from src.eval_utils import (
    partial_match, 
    compute_tsed
)

In [6]:
# Table -> {column -> type} mapping that is wrapped in `Schema` and handed to
# `extract_all` further down.
# NOTE(review): within each table every column carries the same type string
# (e.g. `movie_title` and `director_name` are 'integer', `list_followers` is
# 'text'). That looks like a placeholder or an extraction artifact rather than
# the real column types — confirm before relying on the types.
schema_dict = {'lists': {'user_id': 'text',
  'list_id': 'text',
  'list_title': 'text',
  'list_movie_number': 'text',
  'list_update_timestamp_utc': 'text',
  'list_creation_timestamp_utc': 'text',
  'list_followers': 'text',
  'list_url': 'text',
  'list_comments': 'text',
  'list_description': 'text',
  'list_cover_image_url': 'text',
  'list_first_image_url': 'text',
  'list_second_image_url': 'text',
  'list_third_image_url': 'text'},
 'movies': {'movie_id': 'integer',
  'movie_title': 'integer',
  'movie_release_year': 'integer',
  'movie_url': 'integer',
  'movie_title_language': 'integer',
  'movie_popularity': 'integer',
  'movie_image_url': 'integer',
  'director_id': 'integer',
  'director_name': 'integer',
  'director_url': 'integer'},
 'ratings_users': {'user_id': 'integer',
  'rating_date_utc': 'integer',
  'user_trialist': 'integer',
  'user_subscriber': 'integer',
  'user_avatar_image_url': 'integer',
  'user_cover_image_url': 'integer',
  'user_eligible_for_trial': 'integer',
  'user_has_payment_method': 'integer'},
 'lists_users': {'user_id': 'text',
  'list_id': 'text',
  'list_update_date_utc': 'text',
  'list_creation_date_utc': 'text',
  'user_trialist': 'text',
  'user_subscriber': 'text',
  'user_avatar_image_url': 'text',
  'user_cover_image_url': 'text',
  'user_eligible_for_trial': 'text',
  'user_has_payment_method': 'text'},
 'ratings': {'movie_id': 'integer',
  'rating_id': 'integer',
  'rating_url': 'integer',
  'rating_score': 'integer',
  'rating_timestamp_utc': 'integer',
  'critic': 'integer',
  'critic_likes': 'integer',
  'critic_comments': 'integer',
  'user_id': 'integer',
  'user_trialist': 'integer',
  'user_subscriber': 'integer',
  'user_eligible_for_trial': 'integer',
  'user_has_payment_method': 'integer'}}

# Example queries, one per line. The raw text keeps its own name so that
# `sqls` is only ever bound to the final list of query strings.
raw_queries = """
SELECT movie_release_year FROM movies WHERE movie_title = 'Cops'
SELECT T1.user_id FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE rating_score = 4 AND rating_timestamp_utc LIKE '2013-05-04 06:33:32' AND T2.movie_title LIKE 'Freaks'
SELECT T1.user_trialist FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE T2.movie_title = 'A Way of Life' AND T1.user_id = 39115684
SELECT T2.movie_title FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE T1.rating_timestamp_utc LIKE '2020%' GROUP BY T2.movie_title ORDER BY COUNT(T2.movie_title) DESC LIMIT 1
SELECT AVG(T1.rating_score), T2.director_name FROM ratings AS T1 INNER JOIN movies AS T2 ON T1.movie_id = T2.movie_id WHERE T2.movie_title = 'When Will I Be Loved'
"""

# Trim the surrounding blank lines, split on newlines, and strip each query.
sqls = list(map(str.strip, raw_queries.strip().split('\n')))

# Wrap the raw table/column mapping in the project's Schema object.
schema = Schema(schema_dict)

In [16]:
# Individual extraction helpers from src.parsing_sql (public names and
# underscore-prefixed internals). `extract_aliases` used to be listed twice;
# each name now appears exactly once.
from src.parsing_sql import (
    extract_aliases,
    extract_condition,
    get_subqueries,
    _extract_conditions,
    _extract_columns_from_expression,
    _determine_tag,
    _format_expression,
    _get_full_column_name,
    extract_selection,
    extract_aggregation,
    extract_orderby,
    extract_others,
    _extract_aliases_from_select,
    _handle_table_or_subquery,
)

In [None]:
def _print_asts(entries, label_of):
    """Print a label line and the AST repr for each extracted entry.

    Args:
        entries: tuples stored under one of the `*_asts` keys of the
            `extract_all` output; index 1 of each tuple is printed as the AST.
        label_of: callable mapping an entry to the text shown on its first line.
    """
    for i, entry in enumerate(entries):
        print(f' [{i}] {label_of(entry)}')
        print(f' [{i}] ast:')
        print('  ' + repr(entry[1]))


# Parse every example query and dump each extracted component.
for sql in sqls:
    parsed_sql = sqlglot.parse_one(sql)
    output = extract_all(parsed_sql, schema)

    print('SQL:', sql)
    print('# Selection')
    print(f'  unique columns: {output["sel"]}')
    _print_asts(output['sel_asts'], lambda entry: f'type: {entry[2]}')

    if output['cond_asts']:
        print('\n# condition')
        print(f'  operations: {output["op_types"]}')
        _print_asts(output['cond_asts'], lambda entry: entry[0])

    if output['agg_asts']:
        print('\n# aggregation')
        print(f'  unique columns: {output["agg"]}')
        _print_asts(output['agg_asts'], lambda entry: entry[0])

    if output['orderby_asts']:
        print('\n# orderby')
        print(f'  unique columns: {output["orderby"]}')
        # Fixed: this section used to iterate output['group_asts'] although it
        # is guarded by (and labelled as) the ORDER BY entries.
        _print_asts(output['orderby_asts'], lambda entry: entry[0])

    if output['nested']:
        print('\n# nested')
        print(f'  number of nested: {output["nested"]}')
        # Check `output['subqueries']` if you want to see the nested queries;
        # the first element is the original query.
    if output['distinct']:
        print(f'\n# distinct: {output["distinct"]}')
    if output['limit']:
        print(f'\n# limit: {output["limit"]}')
    print('----------------------------------')

In [113]:
def _asts(entries):
    """Return the AST (index 1) of every extracted entry; falsy input -> []."""
    return [entry[1] for entry in (entries or [])]


def _mean_pairwise_tsed(pred_asts, target_asts, build_type='apted'):
    """Mean TSED over every (prediction AST, target AST) pair."""
    scores = [
        compute_tsed(pred_ast, target_ast, build_type=build_type)[0]
        for pred_ast in pred_asts
        for target_ast in target_asts
    ]
    return np.mean(scores)


def _format_clause_score(pred_asts, target_asts, both_absent='1.0'):
    """Format the partial-match score of one clause type.

    Returns:
        * the mean pairwise TSED (4 decimals) when both queries have the clause,
        * '0.0' when exactly one of them has it — this now includes the case
          "prediction has it, target does not", which previously fell through
          to the "both absent" branch and was reported as 1.0,
        * `both_absent` when neither query has it.
    """
    if pred_asts and target_asts:
        return f'{_mean_pairwise_tsed(pred_asts, target_asts):.4f}'
    if pred_asts or target_asts:
        return '0.0'
    return both_absent


def _format_flag_score(pred_flag, target_flag):
    """'1.0' when both queries agree on the presence of a flag, else '0.0'."""
    return '1.0' if bool(pred_flag) == bool(target_flag) else '0.0'


# Example pair. The SQL text keeps its own names so `sql1` / `sql2` are only
# ever bound to the parsed expressions.
pred_sql = """SELECT T1.USER_ID 
FROM ratings AS T1 
INNER JOIN movies AS T2 
ON T1.movie_id = T2.movie_id 
WHERE 
    rating_score = 4 
    AND rating_timestamp_utc LIKE '2013-05-04 06:33:32' 
    AND T2.movie_title LIKE 'Freaks'
"""

target_sql = """SELECT T1.user_id, COUNT(T2.movie_title)
FROM ratings AS T1 
INNER JOIN movies AS T2 
ON T1.movie_id = T2.movie_id 
GROUP BY T1.user_id
HAVING COUNT(T2.movie_title) > 1
ORDER BY COUNT(T2.movie_title) DESC
"""

# Alternative example pair (disabled):
# pred_sql = """SELECT
#   COUNT(*) AS late_line_items_count, (SUM(lineitem.L_EXTENDEDPRICE * lineitem.L_DISCOUNT) / 100) AS revenue
# FROM LINEITEM
# WHERE
#   lineitem.L_RECEIPTDATE > lineitem.L_COMMITDATE
#   AND STRFTIME('%Y', lineitem.L_RECEIPTDATE) = 'abcd'"""

# target_sql = """SELECT
#   COUNT(*) AS count
# FROM lineitem
# WHERE
#   lineitem.l_commitdate < lineitem.l_receiptdate
#   AND lineitem.l_receiptdate >= '1993-01-01'
#   AND lineitem.l_receiptdate < '1994-01-01'"""

sql1 = sqlglot.parse_one(pred_sql)    # prediction
sql2 = sqlglot.parse_one(target_sql)  # target
output1 = extract_all(sql1, schema)
output2 = extract_all(sql2, schema)

print('[SQL1]\n', sql1.sql(pretty=True))
print()
print('[SQL2]\n', sql2.sql(pretty=True))
print()

# Whole-query score. `tsed` / `distance` are no longer overwritten by the
# per-clause computations below.
tsed, distance = compute_tsed(sql1, sql2, build_type='apted')  # apted or zss
print(f'TSED: {tsed:.4f}')
print(f'Tree Edit Distance: {distance}')

# Per-clause partial match.
print('Partial Match Score')
for label, key in (
    ('Selection', 'sel_asts'),
    ('Condition', 'cond_asts'),
    ('Aggregation', 'agg_asts'),
):
    clause_score = _format_clause_score(_asts(output1[key]), _asts(output2[key]))
    print(f'  {label}: {clause_score}')

orderby_score = _format_clause_score(
    _asts(output1['orderby_asts']),
    _asts(output2['orderby_asts']),
    both_absent='1.0 (no orderby)',
)
print(f'  Orderby: {orderby_score}')

# subqueries[0] is the original query, so only the remaining entries are
# nested queries; they are passed to compute_tsed as-is.
nested_score = _format_clause_score(output1['subqueries'][1:], output2['subqueries'][1:])
print(f'  Nested: {nested_score}')

print(f"  Distinct: {_format_flag_score(output1['distinct'], output2['distinct'])}")
print(f"  Limit: {_format_flag_score(output1['limit'], output2['limit'])}")

[SQL1]
 SELECT
  ratings.user_id
FROM ratings AS T1
INNER JOIN movies AS T2
  ON T1.movie_id = T2.movie_id
WHERE
  ratings.rating_score = [placeholder-type:numeric]
  AND ratings.rating_timestamp_utc LIKE '[placeholder-type:string]'
  AND movies.movie_title LIKE '[placeholder-type:string]'

[SQL2]
 SELECT
  ratings.user_id,
  COUNT(movies.movie_title)
FROM ratings AS T1
INNER JOIN movies AS T2
  ON T1.movie_id = T2.movie_id
GROUP BY
  ratings.user_id
HAVING
  COUNT(movies.movie_title) > [placeholder-type:numeric]
ORDER BY
  COUNT(movies.movie_title)

TSED: 0.4634
Tree Edit Distance: 22
Partial Match Score
  Selection: 0.5000
  Condition: 0.2222
  Aggregation: 0.0
  Orderby: 0.0
  Nested: 1.0
  Distinct: 1.0
  Limit: 1.0
