# Text2SQL Test Notebook

In [1]:
from pathlib import Path
from main import argument_parsing
from src.model import Text2SQL
# import pytorch_lightning as pl
# import numpy as np

# pl.seed_everything(np.random.randint(0, 100))

# Build the project's parser and keep only the first element returned by
# parse_known_args(); the remaining elements are not used in this notebook.
args_parser = argument_parsing(preparse=True)
known_args = args_parser.parse_known_args()
args = known_args[0]

# Checkpoint file to evaluate, looked up inside the configured checkpoint directory.
CKPT_NAME = "epochepoch=05-val_loss=35.578-val_acc_sc=0.545-val_acc_sa=1.000-val_acc_wn=1.000-val_acc_wo=1.000.ckpt"
ckpt_path = Path(args.ckpt_dir) / CKPT_NAME

model = Text2SQL.load_from_checkpoint(ckpt_path)
model.eval()
# Ending the cell with print() keeps the notebook from echoing the value of model.eval().
print()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.





```
  | Name                 | Type              | Params
-----------------------------------------------------------
0 | model_bert           | BertModel         | 92.2 M
1 | model_decoder        | Decoder           | 22.5 M
2 | cross_entropy        | CrossEntropyLoss  | 0
3 | binary_cross_entropy | BCEWithLogitsLoss | 0
4 | acc_sc               | Accuracy          | 0
5 | acc_sa               | Accuracy          | 0
6 | acc_wn               | Accuracy          | 0
7 | acc_wo               | Accuracy          | 0
8 | pp_wv                | Perplexity        | 0
-----------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
434.292   Total estimated model params size (MB)
```

In [2]:
def get_predict_sql(header, model, predicts, table_id="receipts"):
    """Render the first example's predictions as a SQL query string.

    Args:
        header: Sequence of column names, indexed by the predicted column ids.
        model: Object exposing ``dbengine.agg_ops`` and ``dbengine.cond_ops``,
            indexed by the predicted aggregation id and operator ids.
        predicts: Mapping with the keys ``'sc'``, ``'sa'``, ``'wn'``, ``'wc'``,
            ``'wo'`` and ``'wv'``; only element ``[0]`` of each entry is read.
        table_id: Table name placed in the ``FROM`` clause.

    Returns:
        ``SELECT <agg>(<column>) FROM <table_id>``, followed by a ``WHERE``
        clause of ``AND``-joined conditions when at least one condition is
        predicted. At most ``predicts['wn'][0]`` conditions are emitted.
    """
    p_sc = header[predicts['sc'][0]]
    p_sa = model.dbengine.agg_ops[predicts['sa'][0]]
    # A negative count is treated as "no conditions".
    where_num = max(int(predicts['wn'][0]), 0)

    # Pair up column / operator / value predictions and keep only the first
    # `where_num` of them, so a length mismatch can never leave a dangling
    # " AND " or glue two conditions together.
    triples = list(zip(predicts['wc'][0], predicts['wo'][0], predicts['wv'][0]))
    conditions = []
    for wc, wo, wv in triples[:where_num]:
        p_wc = header[wc]
        p_wo = model.dbengine.cond_ops[wo]
        p_wv = wv.replace("[E]", "").strip()
        # Double embedded single quotes so the value stays one SQL string literal.
        p_wv = p_wv.replace("'", "''")
        conditions.append(f"{p_wc} {p_wo} '{p_wv}'")

    predict_SQL = f"SELECT {p_sa}({p_sc}) FROM {table_id}"
    # Only emit WHERE when there is something to filter on.
    if conditions:
        predict_SQL += " WHERE " + " AND ".join(conditions)
    return predict_SQL


In [3]:
# Name of the single table used throughout this notebook.
table_id = "receipts"

# Load the table definitions and pull out this table's column names.
table = model.load_tables(Path(args.train_table_file))
header = table[table_id]["header"]

# Fetch every row as a DataFrame and show the rows from index label 65 onwards.
select_all = f"SELECT * FROM {table_id}"
res = model.dbengine.db.query(select_all).export("df")
res.loc[65:]

Unnamed: 0,index,rcept_no,reprt_code,bsns_year,corp_code,stock_code,fs_div,fs_nm,sj_div,sj_nm,account_nm,thstrm_nm,thstrm_dt,thstrm_amount,frmtrm_nm,frmtrm_dt,frmtrm_amount,bfefrmtrm_nm,bfefrmtrm_dt,bfefrmtrm_amount
65,5,20210309000744,11011,2020,126380,5930,CFS,연결재무제표,BS,재무상태표,부채총계,제 52 기,2020.12.31 현재,102287702000000,제 51 기,2019.12.31 현재,89684076000000,제 50 기,2018.12.31 현재,91604067000000
66,6,20210309000744,11011,2020,126380,5930,CFS,연결재무제표,BS,재무상태표,자본금,제 52 기,2020.12.31 현재,897514000000,제 51 기,2019.12.31 현재,897514000000,제 50 기,2018.12.31 현재,897514000000
67,7,20210309000744,11011,2020,126380,5930,CFS,연결재무제표,BS,재무상태표,이익잉여금,제 52 기,2020.12.31 현재,271068211000000,제 51 기,2019.12.31 현재,254582894000000,제 50 기,2018.12.31 현재,242698956000000
68,8,20210309000744,11011,2020,126380,5930,CFS,연결재무제표,BS,재무상태표,자본총계,제 52 기,2020.12.31 현재,275948016000000,제 51 기,2019.12.31 현재,262880421000000,제 50 기,2018.12.31 현재,247753177000000
69,9,20210309000744,11011,2020,126380,5930,CFS,연결재무제표,IS,손익계산서,매출액,제 52 기,2020.01.01 ~ 2020.12.31,236806988000000,제 51 기,2019.01.01 ~ 2019.12.31,230400881000000,제 50 기,2018.01.01 ~ 2018.12.31,243771415000000
70,10,20210309000744,11011,2020,126380,5930,CFS,연결재무제표,IS,손익계산서,영업이익,제 52 기,2020.01.01 ~ 2020.12.31,35993876000000,제 51 기,2019.01.01 ~ 2019.12.31,27768509000000,제 50 기,2018.01.01 ~ 2018.12.31,58886669000000
71,11,20210309000744,11011,2020,126380,5930,CFS,연결재무제표,IS,손익계산서,법인세차감전 순이익,제 52 기,2020.01.01 ~ 2020.12.31,36345117000000,제 51 기,2019.01.01 ~ 2019.12.31,30432189000000,제 50 기,2018.01.01 ~ 2018.12.31,61159958000000


## Example 1

In [4]:
# Natural-language question for Example 1, wrapped in the list-of-dicts
# payload that is later passed to model.predict_outputs.
Q = "2020년도에서 삼성전자의 법인세 차감전순 이익은 얼마니 ???"
data = [dict(question=Q, table_id=table_id)]
data

[{'question': '2020년도에서 삼성전자의 법인세 차감전순 이익은 얼마니 ???', 'table_id': 'receipts'}]

In [6]:
# `predicts` is first assigned in a later cell, so evaluating it here fails on a
# fresh kernel (Restart & Run All). Compute it in this cell before displaying it.
predicts = model.predict_outputs(data, table)
predicts

{'sc': [13],
 'sa': [0],
 'wn': [2],
 'wc': [[17, 16]],
 'wo': [[0, 0]],
 'wv_tkns': [[[2455, 7452, 5436, 8003]], [[2455, 7452, 5436, 8003]]],
 'wv': [('부채총계 [E]', '부채총계 [E]')]}

In [5]:
# Run the model on the Example 1 payload and render its prediction as SQL.
predicts = model.predict_outputs(data, table)
# ANSWER: "SELECT thstrm_amount FROM receipts WHERE account_nm = '법인세차감전 순이익' AND bsns_year = 2020"
predicted_sql = get_predict_sql(header, model, predicts)
predicted_sql

"SELECT (thstrm_amount) FROM receipts WHERE bfefrmtrm_nm = '부채총계' AND frmtrm_amount = '부채총계'"

In [6]:
# Show the raw `predicts` dict that was passed to get_predict_sql in the previous cell.
predicts

{'sc': [13],
 'sa': [0],
 'wn': [2],
 'wc': [[17, 16]],
 'wo': [[0, 0]],
 'wv_tkns': [[[2455, 7452, 5436, 8003]], [[2455, 7452, 5436, 8003]]],
 'wv': [('부채총계 [E]', '부채총계 [E]')]}

Reason: 

- The data is too simple (needs more diversity)
- loss function issue

TODO:

- May need execution-guided decoding (+ beam search)
- Extend the dataset to all companies
- Improve Model
- Code Refactoring
- Return to the original purpose: resolve the ambiguous parts of questions
- Build Application with Streamlit

In [None]:
# NOTE(review): neither `tensor_to_img_array` nor `window_grid_mask` is defined or
# imported in the cells visible here, and this cell has no execution count. It looks
# like a leftover from another notebook and would raise NameError on Run All —
# confirm and remove.
tensor_to_img_array(window_grid_mask)