<a href="https://colab.research.google.com/github/roxyrong/w266_project/blob/main/predicted_result.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install accelerate -U
!pip install nltk
!pip install peft

In [4]:
# List the working directory to check that spider/, results/ and third_party/ are present.
%ls

GPT_J_Baseline.ipynb               spider.py
README.md                          spider.zip
model_upload.ipynb                 t5_base.ipynb
predicted_result.ipynb             t5_finetune_lambdalabs.ipynb
predicted_result_lambdalabs.ipynb  t5_finetune_text_to_sql.ipynb
project_setup.ipynb                t5_soft_prompt_tuning_lambdalabs.ipynb
project_setup_lambdalabs.ipynb     t5_soft_prompt_tuning_text_to_sql.ipynb
[0m[01;34mresults[0m/                           [01;34mthird_party[0m/
[01;34mspider[0m/


In [1]:
import collections
import gc
import pprint
import subprocess

import nltk
import numpy as np
import pandas as pd
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Fetch the NLTK "punkt" data package; the call reports "already up-to-date" when it is installed.
# NOTE(review): no cell visible in this notebook calls nltk directly; presumably the
# Spider evaluation script that is run as a subprocess needs it — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Finetuned version: t5-base tokenizer plus the RoxyRong/t5_base_finetuned_15_epochs checkpoint.

tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=128)
# Prefer the GPU, but fall back to CPU so the model still loads on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained("RoxyRong/t5_base_finetuned_15_epochs").to(device)
# File that receives the generated SQL, one prediction per line.
predict_result_path = 'results/predicted_result_t5_base_finetuned_15_epochs.txt'

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [4]:
# Soft prompt tuning version (alternative to the finetuned cell above; uncomment to use it instead)

# tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=128)
# peft_model_id = "RoxyRong/t5_base_soft_prompt_2_10epochs"
# config = PeftConfig.from_pretrained(peft_model_id)
# model = AutoModelForSeq2SeqLM.from_pretrained("RoxyRong/t5_base_finetuned_2")
# model = PeftModel.from_pretrained(model, peft_model_id)
# model = model.to("cuda")
# predict_result_path = f'base_model/predicted_result_t5_base_soft_prompt_2_10epochs.txt'

In [5]:
# Datasets: the Spider train, train_others and dev splits.
# Read explicitly as UTF-8 so parsing does not depend on the platform's default
# locale encoding (the prediction file is written as UTF-8 as well).
with open('spider/train_spider.json', 'r', encoding='utf-8') as f:
    train_spider = pd.read_json(f)
with open('spider/train_others.json', 'r', encoding='utf-8') as f:
    others_spider = pd.read_json(f)
with open('spider/dev.json', 'r', encoding='utf-8') as f:
    dev_spider = pd.read_json(f)

In [6]:
# Load the schema for all tables (one row per database).
# Read explicitly as UTF-8 so parsing does not depend on the platform's default locale encoding.
with open('spider/tables.json', 'r', encoding='utf-8') as f:
    schema_df = pd.read_json(f)

In [7]:
def _get_schema_string(table_json):
    """Returns the schema serialized as a string."""
    table_id_to_column_names = collections.defaultdict(list)
    for table_id, name in table_json["column_names_original"]:
        table_id_to_column_names[table_id].append(name.lower())
        tables = table_json["table_names_original"]

    table_strings = []
    for table_id, table_name in enumerate(tables):
        column_names = table_id_to_column_names[table_id]
        table_string = " | %s : %s" % (table_name.lower(), " , ".join(column_names))
        table_strings.append(table_string)

    return "".join(table_strings)

# Map each database id to its serialized schema string.
schema_dict = {
    table_row['db_id']: _get_schema_string(table_row)
    for _, table_row in schema_df.iterrows()
}

In [8]:
# Shuffle the rows of the two training splits (dev_spider is not shuffled).

train_spider = train_spider.iloc[np.random.permutation(len(train_spider))].reset_index(drop=True)
# Permute others_spider's own rows: indexing train_spider here would replace the
# split with rows taken from train_spider.
others_spider = others_spider.iloc[np.random.permutation(len(others_spider))].reset_index(drop=True)

In [9]:
# Every split gets the same two derived columns: the serialized schema of its
# database, and the model prompt (task prefix + question + schema).
# NOTE(review): 'Databse' is misspelled in the prompt text; it is reproduced as-is
# because it presumably has to match the prompts the model was trained on — confirm.
prefix = 'translate English to SQL:'

for spider_split in (train_spider, others_spider, dev_spider):
    spider_split['schema'] = spider_split['db_id'].map(schema_dict)
    spider_split['prompt'] = (
        prefix + spider_split['question'] + '\nDatabse schema is ' + spider_split['schema']
    )

In [10]:
# Collect unreachable Python objects first, then release PyTorch's cached but
# unused GPU memory. (`gc` is imported with the other imports at the top.)
gc.collect()
torch.cuda.empty_cache()

In [11]:
# Evaluate: generate SQL for the dev split in batches and write one
# "<prediction>\t<db_id>" line per example to predict_result_path.
max_length = 128
step = 100

# Open the output once in write mode so that re-running the cell replaces the
# file instead of appending a second copy of every prediction.
with open(predict_result_path, 'w', encoding='utf-8') as f:
    # Iterate over the actual size of the dev split rather than a hard-coded bound.
    for i in range(0, len(dev_spider), step):
        print(i)
        batch = dev_spider.iloc[i:i+step]
        inputs = tokenizer.batch_encode_plus(
            list(batch['prompt']),
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Move the encoded batch to the device the model is on.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        output_tokens = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length
        )

        outputs = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in output_tokens]

        # Pair each prediction with the db_id of its own row in this batch; a
        # batch-local offset into dev_spider would always hit the first batch.
        for output, db_id in zip(outputs, batch['db_id']):
            f.write(output + '\t' + db_id + '\n')
        # Keep the file on disk current after every batch.
        f.flush()

0
100
200
300
400
500
600
700
800
900
1000


In [12]:
# Evaluate results: run the Spider evaluation script on the prediction file.
# predict_result_path = f'base_model/predicted_result_t5_base_soft_prompt_2.txt'

eval_path = "third_party/spider/evaluation.py"
gold = "third_party/spider/evaluation_examples/gold_example.txt"
pred = predict_result_path
db_dir = "spider/database"
table = "spider/tables.json"
etype = "all"

# Pass the arguments as a list (no shell) so every path reaches the script
# verbatim, without depending on shell quoting or escaping.
cmd = [
    "python3", eval_path,
    "--gold", gold,
    "--pred", pred,
    "--db", db_dir,
    "--table", table,
    "--etype", etype,
]
result = subprocess.run(cmd, capture_output=True, text=True)

# Output is captured, so surface a failure here instead of silently leaving
# result.stdout empty.
if result.returncode != 0:
    print(result.stderr)

In [13]:
# Show only the tail of the evaluator's stdout. (`pprint` is imported with the
# other imports at the top.)
# NOTE(review): the slice length 4633 was presumably chosen to cover just the
# final summary tables — confirm if the evaluator's output format changes.
pprint.pprint(result.stdout[-4633:])

('                     easy                 medium               '
 'hard                 extra                all                 \n'
 'count                250                  440                  '
 '174                  170                  1034                \n'
 'execution            0.604                0.386                '
 '0.316                0.141                0.387               \n'
 '\n'
 'exact match          0.644                0.368                '
 '0.287                0.124                0.381               \n'
 '\n'
 '---------------------PARTIAL MATCHING ACCURACY----------------------\n'
 'select               0.951                0.903                '
 '0.974                0.953                0.935               \n'
 'select(no AGG)       0.967                0.917                '
 '0.974                0.953                0.946               \n'
 'where                0.857                0.825                '
 '0.529                0.571         