<a href="https://colab.research.google.com/github/roxyrong/w266_project/blob/main/predicted_result.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install accelerate -U
!pip install nltk

In [2]:
# List the current working directory (sanity check of the checkout layout).
%ls

README.md                       t5_base.ipynb
model_upload.ipynb              t5_finetune_lambdalabs.ipynb
predicted_result-Copy1.ipynb    t5_finetune_text_to_sql.ipynb
predicted_result.ipynb          t5_soft_prompt_tuning_lambdalabs.ipynb
project_setup.ipynb             t5_soft_prompt_tuning_text_to_sql.ipynb
project_setup_lambdalabs.ipynb  [0m[01;34mthird_party[0m/
spider.py


In [5]:
import numpy as np
import pandas as pd
import collections
import nltk
import torch
import subprocess
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [6]:
# Fetch the 'punkt' package through nltk's downloader.
# NOTE(review): no visible cell uses nltk after this — confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
# Use the GPU when one is visible; otherwise fall back to CPU so the cell does
# not crash on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer is loaded from "t5-base" with inputs capped at 128 tokens; the
# seq2seq weights are loaded from the "RoxyRong/t5_base_finetuned_final"
# checkpoint and moved to the selected device.
tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=128)
model = AutoModelForSeq2SeqLM.from_pretrained("RoxyRong/t5_base_finetuned_final").to(device)

In [9]:
# datasets
def _read_json_frame(path):
    """Open ``path`` in text mode and parse its contents with ``pd.read_json``."""
    with open(path, 'r') as handle:
        return pd.read_json(handle)


# One DataFrame per Spider JSON file.
train_spider = _read_json_frame('spider/train_spider.json')
others_spider = _read_json_frame('spider/train_others.json')
dev_spider = _read_json_frame('spider/dev.json')

In [10]:
# load schema for all tables
# Later cells read 'db_id', 'column_names_original' and 'table_names_original'
# from each row of this frame.
tables_path = 'spider/tables.json'
with open(tables_path, 'r') as tables_file:
    schema_df = pd.read_json(tables_file)

In [11]:
def _get_schema_string(table_json):
    """Returns the schema serialized as a string."""
    table_id_to_column_names = collections.defaultdict(list)
    for table_id, name in table_json["column_names_original"]:
        table_id_to_column_names[table_id].append(name.lower())
        tables = table_json["table_names_original"]

    table_strings = []
    for table_id, table_name in enumerate(tables):
        column_names = table_id_to_column_names[table_id]
        table_string = " | %s : %s" % (table_name.lower(), " , ".join(column_names))
        table_strings.append(table_string)

    return "".join(table_strings)

# Map each row's 'db_id' to its serialized schema string (a repeated db_id
# keeps the value from the last row seen).
schema_dict = {
    table_row['db_id']: _get_schema_string(table_row)
    for _, table_row in schema_df.iterrows()
}

In [12]:
# shuffle the dataset
#
# Each frame is reordered by a random permutation of its OWN row positions.
# (Previously `others_spider` was rebuilt from `train_spider`, which replaced
# the "others" examples with training rows.)
# A seeded generator makes the order reproducible from a fresh kernel.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

train_spider = train_spider.iloc[shuffle_rng.permutation(len(train_spider))].reset_index(drop=True)
others_spider = others_spider.iloc[shuffle_rng.permutation(len(others_spider))].reset_index(drop=True)

In [13]:
prefix = 'translate English to SQL:'


def _attach_schema_and_prompt(frame):
    """Add 'schema' and 'prompt' columns to ``frame`` in place.

    'schema' is the serialized schema looked up by the row's 'db_id';
    'prompt' is the task prefix, the question, and the schema joined together.
    """
    frame['schema'] = frame['db_id'].map(schema_dict)
    # NOTE(review): the 'Databse' spelling is kept exactly as-is; presumably
    # the model was trained on prompts with this text — confirm before fixing.
    frame['prompt'] = prefix + frame['question'] + '\nDatabse schema is ' + frame['schema']


# Same treatment for every split, in the original order.
for spider_split in (train_spider, others_spider, dev_spider):
    _attach_schema_and_prompt(spider_split)

In [None]:
# evaluate
#
# Generate one prediction per dev example with beam search and write them,
# one per line as "<prediction>\t<db_id>", to `predict_result_path`.
max_length = 128   # token cap for both the encoded prompt and the generated output
batch_size = 100   # number of prompts sent to the model per generate() call
predict_result_path = 'base_model/predicted_result_beam_search_4.txt'

# The file is opened once in write mode so re-running the cell replaces the
# previous predictions instead of appending duplicates after them.
with open(predict_result_path, 'w') as f:
    # Step over the real length of the dev set rather than a hard-coded bound.
    for start in range(0, len(dev_spider), batch_size):
        print(start)
        batch = dev_spider.iloc[start:start + batch_size]

        inputs = tokenizer.batch_encode_plus(
            list(batch['prompt']),
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Follow whatever device the model lives on instead of assuming CUDA.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        output_tokens = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

        outputs = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in output_tokens]

        # Pair each prediction with the db_id of its own row in this batch.
        # (Indexing dev_spider with the in-batch position gave every batch
        # after the first the db_ids of rows 0..99.)
        for output, db_id in zip(outputs, batch['db_id']):
            f.write(output + '\t' + db_id + '\n')
        # Flush per batch so partial results survive an interrupted run.
        f.flush()

0
100
200
300


In [None]:
# evaluate results
#
# Run the Spider evaluation script as a subprocess and capture its output in
# `result` for the next cell.
eval_path = "third_party/spider/evaluation.py"
# NOTE(review): this is the evaluator's bundled example gold file — confirm it
# lines up line-for-line with the predictions written for dev_spider.
gold = "third_party/spider/evaluation_examples/gold_example.txt"
pred = predict_result_path
db_dir = "spider/database"
table = "spider/tables.json"
etype = "all"

# An argument list (no shell) passes each path through verbatim, so no manual
# quoting is needed.
cmd_args = [
    "python3", eval_path,
    "--gold", gold,
    "--pred", pred,
    "--db", db_dir,
    "--table", table,
    "--etype", etype,
]
result = subprocess.run(cmd_args, capture_output=True, text=True)

# Surface evaluator failures here; otherwise the next cell would just display
# an empty stdout with no hint of what went wrong.
if result.returncode != 0:
    raise RuntimeError(
        f"Spider evaluation exited with code {result.returncode}:\n{result.stderr}"
    )

In [None]:
import pprint

# Display only the last 4633 characters of the evaluator's captured stdout.
# NOTE(review): 4633 is presumably the length of the final summary table —
# confirm, since it silently depends on the evaluator's output format.
stdout_tail = result.stdout[-4633:]
pprint.pprint(stdout_tail)