In [1]:
import sqlparse
from sqlparse.sql import Token, TokenList
import re

In [2]:
def clean_query(tokens):
    """Return ``tokens`` without whitespace, punctuation and grouped comments.

    Only the level that is passed in is filtered; children of grouped nodes
    are left untouched, so nested groups have to be cleaned separately.

    Parameters
    ----------
    tokens : iterable of sqlparse tokens
        Typically the ``.tokens`` list of a parsed statement or group.

    Returns
    -------
    list
        The remaining tokens, in their original order.
    """
    # Token types are hierarchical: ``ttype in Whitespace`` is true for
    # Whitespace and its sub-types (e.g. newlines) and false for ``None``,
    # which is the ttype grouped nodes report.
    noise_types = (sqlparse.tokens.Whitespace, sqlparse.tokens.Punctuation)
    return [
        tok for tok in tokens
        if not any(tok.ttype in noise for noise in noise_types)
        # Only grouped Comment nodes are dropped; a leaf comment token that
        # is passed in directly is kept.
        and not isinstance(tok, sqlparse.sql.Comment)
    ]

In [3]:
# Sample statement with a WITH clause defining two CTEs (`a` and `b`) and
# several inline `--` comments.
# NOTE(review): the final select reads a.a_ind and b.b_sum, while the CTEs
# define aa_ind and bb_sum — confirm whether the mismatch is intended.
query1 = '''--asd
            with a as (select a_id1 as a_id, --abc
            case when aa = \'a1\' and a2=3 then 1 else 0 end aa_ind --def
            from `a123.aaa`) --pqr
        , b as (select b_id, 
            sum(bb) bb_sum 
            from `b123.bbb` 
            group by b_id) 
        select a.a_id,a.a_ind,b.b_sum 
        from a left join b 
        on a.a_id = b.b_id'''

# Sample plain SELECT without a WITH clause; same text as the final select
# of query1.
query2 = '''select a.a_id,a.a_ind,b.b_sum 
        from a left join b 
        on a.a_id = b.b_id'''

In [4]:
# The parse result is indexed per statement; query1 holds one statement, so
# keep the first entry.
parsed_query = sqlparse.parse(query1)[0]

In [5]:
# Top-level tokens of the statement, before any cleaning: the leading comment
# and the whitespace tokens are still present.
parsed_query.tokens

[<Comment '--asd ...' at 0x2231C336750>,
 <CTE 'with' at 0x2231C3476A8>,
 <Whitespace ' ' at 0x2231C347708>,
 <IdentifierList 'a as (...' at 0x2231C359750>,
 <Whitespace ' ...' at 0x2231C3546A8>,
 <DML 'select' at 0x2231C354708>,
 <Whitespace ' ' at 0x2231C354768>,
 <IdentifierList 'a.a_id...' at 0x2231C359930>,
 <Whitespace ' ...' at 0x2231C354BE8>,
 <Keyword 'from' at 0x2231C354C48>,
 <Whitespace ' ' at 0x2231C354CA8>,
 <Identifier 'a' at 0x2231C3594F8>,
 <Whitespace ' ' at 0x2231C354D68>,
 <Keyword 'left j...' at 0x2231C354DC8>,
 <Whitespace ' ' at 0x2231C354E28>,
 <Identifier 'b' at 0x2231C359570>,
 <Whitespace ' ...' at 0x2231C354EE8>,
 <Keyword 'on' at 0x2231C354F48>,
 <Whitespace ' ' at 0x2231C354FA8>,
 <Comparison 'a.a_id...' at 0x2231C3596D8>]

In [6]:
# Keep only the meaningful top-level tokens (no whitespace, punctuation or
# grouped comments).
q_tokens = parsed_query.tokens
q_tokens_cleaned = clean_query(q_tokens)

In [7]:
# Cleaned top-level tokens: keywords alternate with grouped nodes.
q_tokens_cleaned

[<CTE 'with' at 0x2231C3476A8>,
 <IdentifierList 'a as (...' at 0x2231C359750>,
 <DML 'select' at 0x2231C354708>,
 <IdentifierList 'a.a_id...' at 0x2231C359930>,
 <Keyword 'from' at 0x2231C354C48>,
 <Identifier 'a' at 0x2231C3594F8>,
 <Keyword 'left j...' at 0x2231C354DC8>,
 <Identifier 'b' at 0x2231C359570>,
 <Keyword 'on' at 0x2231C354F48>,
 <Comparison 'a.a_id...' at 0x2231C3596D8>]

In [8]:
# Grouped nodes (IdentifierList, Identifier, Comparison) report ttype None;
# only the leaf keyword tokens carry a token type.
[a.ttype for a in q_tokens_cleaned]

[Token.Keyword.CTE,
 None,
 Token.Keyword.DML,
 None,
 Token.Keyword,
 None,
 Token.Keyword,
 None,
 Token.Keyword,
 None]

In [9]:
# First cleaned token: the `with` keyword that opens the CTE clause.
kw1 = q_tokens_cleaned[0]

In [10]:
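# Abandoned attempt, kept for reference: checking the keyword with isinstance
# against a token type. The following cells use Token.match(ttype, values) instead.
# isinstance(kw1,sqlparse.tokens.Keyword.CTE)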

In [11]:
# match(ttype, values) tests a leaf token's type and value; here it confirms
# that kw1 is the `with` CTE keyword.
kw1.match(sqlparse.tokens.Keyword.CTE,['with'])

True

In [12]:
# Same check for the third cleaned token, the `select` DML keyword.
q_tokens_cleaned[2].match(sqlparse.tokens.Keyword.DML,['select'])

True

In [13]:
# The second cleaned token is the IdentifierList holding the CTE definitions;
# cleaning its children leaves one Identifier per CTE.
clean_query(q_tokens_cleaned[1].tokens)

[<Identifier 'a as (...' at 0x2231C336DE0>,
 <Identifier 'b as (...' at 0x2231C3592A0>]

In [14]:
# First CTE identifier next to its get_name() result, which is the CTE name.
clean_query(q_tokens_cleaned[1].tokens)[0],clean_query(q_tokens_cleaned[1].tokens)[0].get_name()

(<Identifier 'a as (...' at 0x2231C336DE0>, 'a')

In [15]:
# Children of the first CTE identifier's last child, which is the grouped
# trailing `--pqr` comment.
clean_query(q_tokens_cleaned[1].tokens)[0].tokens[-1].tokens

[<Single '--pqr ' at 0x2231C353AC8>, <Whitespace ' ...' at 0x2231C353B28>]

In [16]:
# The same last child shown as a node: a grouped Comment.
clean_query(q_tokens_cleaned[1].tokens)[0].tokens[-1]

<Comment '--pqr ...' at 0x2231C3368B8>

In [17]:
# Cleaning the Comment group's children drops the whitespace but keeps the
# leaf `--pqr` token: clean_query only filters grouped Comment nodes.
clean_query(clean_query(q_tokens_cleaned[1].tokens)[0].tokens[-1].tokens)

[<Single '--pqr ' at 0x2231C353AC8>]

In [18]:
def get_select_from_with(subq_token):
    """Return the cleaned tokens inside a CTE definition's parentheses.

    Parameters
    ----------
    subq_token : grouped sqlparse node
        One CTE definition; its ``.tokens`` are searched for the first
        ``Parenthesis`` child.

    Returns
    -------
    list or None
        ``clean_query`` applied to the children of the first parenthesis
        (the brackets themselves are punctuation and get removed), or
        ``None`` when the definition contains no parenthesis.
    """
    parenthesised = (
        child for child in subq_token.tokens
        if isinstance(child, sqlparse.sql.Parenthesis)
    )
    subquery = next(parenthesised, None)
    if subquery is None:
        return None
    return clean_query(subquery.tokens)

In [19]:
# Cleaned body of the first CTE (`a`): the tokens found inside its parentheses.
test_select_q = get_select_from_with(clean_query(q_tokens_cleaned[1].tokens)[0])
test_select_q

[<DML 'select' at 0x2231C347948>,
 <Identifier 'a_id1 ...' at 0x2231C336C00>,
 <Identifier 'case w...' at 0x2231C3597C8>,
 <Keyword 'from' at 0x2231C3538E8>,
 <Identifier '`a123....' at 0x2231C359138>]

In [20]:
# Raw vs. cleaned children of the first select item (`a_id1 as a_id`).
test_select_q[1].tokens,clean_query(test_select_q[1].tokens)#,[i for i in test_select_q[1].get_identifiers()]

([<Name 'a_id1' at 0x2231C347A08>,
  <Whitespace ' ' at 0x2231C347A68>,
  <Keyword 'as' at 0x2231C347AC8>,
  <Whitespace ' ' at 0x2231C347B28>,
  <Identifier 'a_id' at 0x2231C336ED0>],
 [<Name 'a_id1' at 0x2231C347A08>,
  <Keyword 'as' at 0x2231C347AC8>,
  <Identifier 'a_id' at 0x2231C336ED0>])

In [22]:
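# Disabled experiment, kept for reference: element [0] of the cleaned children is
# the leaf Name token `a_id1` (see the previous output), not a grouped Identifier.
# NOTE(review): presumably these calls belong on test_select_q[1] itself — confirm or delete.
# clean_query(test_select_q[1].tokens)[0].get_real_name(),clean_query(test_select_q[1].tokens)[0].get_alias()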

In [23]:
# Second cleaned child of the `a_id1 as a_id` item: the `as` keyword, a leaf token.
clean_query(test_select_q[1].tokens)[1]

<Keyword 'as' at 0x2231C347AC8>

In [24]:
# NOTE(review): element [1] is the leaf `as` Keyword (see the previous cell),
# and a plain Token has no get_alias, hence the AttributeError recorded below.
# The alias is presumably meant to come from the group itself, e.g.
# test_select_q[1].get_alias() — confirm and fix or remove this cell.
clean_query(test_select_q[1].tokens)[1].get_alias()

AttributeError: 'Token' object has no attribute 'get_alias'

In [25]:
# NOTE(review): element [1] is the leaf `as` Keyword, which has no `.tokens`,
# hence the AttributeError recorded below. Presumably this targeted a grouped
# select item instead — confirm and re-point or remove this cell.
clean_query(test_select_q[1].tokens)[1].tokens[0].tokens

AttributeError: 'Token' object has no attribute 'tokens'

In [26]:
# NOTE(review): fails on `.tokens` of the leaf `as` Keyword before get_cases is
# reached. get_cases suggests the CASE expression was the target — presumably
# under test_select_q[2]; confirm and re-point or remove this cell.
clean_query(test_select_q[1].tokens)[1].tokens[0].get_cases(skip_ws=True)

AttributeError: 'Token' object has no attribute 'tokens'

In [27]:
# NOTE(review): fails on `.tokens` of the leaf `as` Keyword before get_cases is
# reached. Presumably meant to print the first condition of the CASE
# expression under test_select_q[2] — confirm and re-point or remove this cell.
str(clean_query(test_select_q[1].tokens)[1].tokens[0].get_cases(skip_ws=True)[0][0])

AttributeError: 'Token' object has no attribute 'tokens'

In [41]:
def parse_select_q(q):
    """Parse one SQL statement and report its top-level shape.

    The statement is parsed with sqlparse and its top-level tokens are
    cleaned with ``clean_query``. For a ``WITH`` statement the function
    prints ``with`` followed by a mapping of each CTE name to the cleaned
    tokens of its body; for a plain ``SELECT`` it prints ``select``.
    Detailed parsing of the individual selects is not implemented yet.

    Parameters
    ----------
    q : str
        SQL text; only the first statement is looked at.

    Returns
    -------
    None
        Results are printed, nothing is returned.

    Raises
    ------
    ValueError
        If ``q`` contains no statement, nothing is left after cleaning, the
        ``WITH`` keyword is not followed by a definition, or the statement
        starts with neither ``WITH`` nor ``SELECT``.
    """
    statements = sqlparse.parse(q)
    if not statements:
        raise ValueError('no SQL statement found in query')
    parsed_q = statements[0]  # assumed that q has a single query
    q_tokens_c = clean_query(parsed_q.tokens)
    if not q_tokens_c:
        raise ValueError('query contains no tokens besides whitespace/comments')

    first_token = q_tokens_c[0]
    if first_token.match(sqlparse.tokens.Keyword.CTE, ['with']):
        print('with')
        if len(q_tokens_c) < 2:
            raise ValueError('WITH keyword is not followed by a subquery definition')

        # Several CTEs are grouped into an IdentifierList whose cleaned
        # children are the individual definitions. A lone CTE is not wrapped
        # in a list, so it has to be used as-is rather than iterated.
        cte_group = q_tokens_c[1]
        if isinstance(cte_group, sqlparse.sql.IdentifierList):
            subq_list_raw = clean_query(cte_group.tokens)
        else:
            subq_list_raw = [cte_group]

        # CTE name -> cleaned tokens of its parenthesised select
        subq_dict = {subq.get_name(): get_select_from_with(subq) for subq in subq_list_raw}

        # parse individual subqueries
        print(subq_dict)
        # for q in subq_dict, parse_simple_select

    elif first_token.match(sqlparse.tokens.Keyword.DML, ['select']):
        # simpler query, parse it
        print('select')
        # parse_simple_select
    else:
        print(q_tokens_c)
        raise ValueError('unsupported statement: expected it to start with WITH or SELECT')
        
def parse_simple_select(q_tokens):
    """Placeholder for parsing the tokens of a plain SELECT.

    Parameters
    ----------
    q_tokens : list
        Presumably the cleaned token list of one select (as produced by
        ``clean_query``) — TODO confirm once implemented.

    Raises
    ------
    NotImplementedError
        Always; the parsing logic has not been written yet.
    """
    raise NotImplementedError('parse_simple_select is not implemented yet')

In [42]:
# query1 starts with WITH, so this prints 'with' followed by the mapping of
# each CTE name to the cleaned tokens of its body.
parse_select_q(query1)

with
{'a': [<DML 'select' at 0x2231C79DAC8>, <Identifier 'a_id1 ...' at 0x2231C7A07C8>, <Identifier 'case w...' at 0x2231C6A4ED0>, <Keyword 'from' at 0x2231C7A1A68>, <Identifier '`a123....' at 0x2231C7A0C78>], 'b': [<DML 'select' at 0x2231C7A1FA8>, <IdentifierList 'b_id, ...' at 0x2231C6A4DE0>, <Keyword 'from' at 0x2231C7A3468>, <Identifier '`b123....' at 0x2231C7A0ED0>, <Keyword 'group' at 0x2231C7A35E8>, <Keyword 'by' at 0x2231C7A36A8>, <Identifier 'b_id' at 0x2231C7A0F48>]}
