<a href="https://colab.research.google.com/github/roxyrong/w266_project/blob/main/predicted_result.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install accelerate -U

In [2]:
# Attach Google Drive to the Colab runtime.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Switch to the project directory on Drive; the relative paths used below resolve from here.
%cd /content/drive/MyDrive/Github/w266_project

/content/drive/MyDrive/Github/w266_project


In [4]:
import numpy as np
import pandas as pd
import collections
import nltk
import torch
import subprocess
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [5]:
# Tokenizer comes from the stock t5-base checkpoint; weights come from the fine-tuned checkpoint.
base_checkpoint = "t5-base"
finetuned_checkpoint = "RoxyRong/t5_base_finetuned_final"

tokenizer = T5Tokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [6]:
# datasets
def _read_json_frame(path):
    """Open `path` for reading and parse its JSON content into a DataFrame."""
    with open(path, 'r') as handle:
        return pd.read_json(handle)

train_spider = _read_json_frame('spider/train_spider.json')
others_spider = _read_json_frame('spider/train_others.json')
dev_spider = _read_json_frame('spider/dev.json')

In [7]:
# Read the table/column metadata for every database into a DataFrame.
schema_path = 'spider/tables.json'
with open(schema_path, 'r') as schema_file:
    schema_df = pd.read_json(schema_file)

In [8]:
def _get_schema_string(table_json):
  """Returns the schema serialized as a string."""
  table_id_to_column_names = collections.defaultdict(list)
  for table_id, name in table_json["column_names_original"]:
    table_id_to_column_names[table_id].append(name.lower())
  tables = table_json["table_names_original"]

  table_strings = []
  for table_id, table_name in enumerate(tables):
    column_names = table_id_to_column_names[table_id]
    table_string = " | %s : %s" % (table_name.lower(), " , ".join(column_names))
    table_strings.append(table_string)

  return "".join(table_strings)

# Map each db_id to its serialized schema string (a repeated db_id keeps the last row's schema).
schema_dict = {
    record['db_id']: _get_schema_string(record)
    for _, record in schema_df.iterrows()
}

In [9]:
# shuffle the dataset
# A seeded generator makes the row order reproducible after Restart & Run All.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)

# Permute row positions, then renumber the index from 0.
train_spider = train_spider.iloc[rng.permutation(len(train_spider))].reset_index(drop=True)
# Shuffle others_spider from its own rows; indexing train_spider here would
# replace the extra examples with training rows.
others_spider = others_spider.iloc[rng.permutation(len(others_spider))].reset_index(drop=True)

In [10]:
# Task prefix placed in front of every question.
prefix = 'translate English to SQL:'

# Add the same two columns to each split:
#   schema - serialized schema looked up by db_id
#   prompt - prefix + question + schema, the text fed to the model
# NOTE(review): 'Databse' is kept exactly as spelled; presumably it matches the
# prompt used when the model was fine-tuned - confirm before correcting it.
for frame in (train_spider, others_spider, dev_spider):
  frame['schema'] = frame['db_id'].map(schema_dict)
  frame['prompt'] = prefix + frame['question'] + '\nDatabse schema is ' + frame['schema']

In [None]:
# evaluate
# Generate SQL for the dev set in batches and write one
# "<prediction>\t<db_id>" line per example, in dev_spider row order.
max_length = 128          # token budget for the encoded prompt
max_output_length = 128   # token budget for the generated SQL
batch_size = 100

# Defined here because the file is written below; a later cell assigns the same value.
predict_result_path = 'base_model/predicted_result_beam_search.txt'

# Predictions for every dev example, so outputs[k] belongs to dev_spider row k.
outputs = []

# NOTE(review): the loop used to start at row 200; it now starts at 0 so the
# file holds one line per dev example - confirm no rows were meant to be skipped.
# The file is opened once: reopening it with 'w' per batch would discard earlier batches.
with open(predict_result_path, 'w') as f:
  for start in range(0, len(dev_spider), batch_size):
    batch = dev_spider.iloc[start:start + batch_size]

    inputs = tokenizer.batch_encode_plus(
        list(batch['prompt']),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    output_tokens = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_output_length,
        num_beams=2,
        early_stopping=True
    )

    batch_outputs = [
        tokenizer.decode(token_ids, skip_special_tokens=True)
        for token_ids in output_tokens
    ]
    outputs.extend(batch_outputs)

    # Pair each prediction with the db_id of the row it was generated from.
    for output, db_id in zip(batch_outputs, batch['db_id']):
      f.write(output + '\t' + db_id + '\n')
    f.flush()  # keep finished batches on disk if a later batch fails

In [12]:
# File that receives the generated SQL predictions.
predict_result_path = 'base_model/predicted_result_beam_search.txt'

In [13]:
# Write one "<prediction>\t<db_id>" line per entry of `outputs`.
# NOTE(review): this assumes outputs[k] was generated from dev_spider row k -
# confirm against the generation cell.
with open(predict_result_path, 'w') as pred_file:
    for row_pos, prediction in enumerate(outputs):
        source_db = dev_spider.iloc[row_pos]['db_id']
        pred_file.write(prediction + '\t' + source_db + '\n')

In [None]:
# evaluate results
# Run the Spider evaluation script on the prediction file; its captured
# stdout/stderr and exit code are kept in `result`.
eval_path = "third_party/spider/evaluation.py"
# NOTE(review): gold points at the bundled example file - confirm it holds the
# gold queries for the same examples, in the same order, as the predictions.
gold = "third_party/spider/evaluation_examples/gold_example.txt"
pred = predict_result_path
db_dir = "spider/database"
table = "spider/tables.json"
etype = "all"

# Pass an argument list instead of a shell string: no shell is started and
# paths need no manual quoting.
cmd_args = [
    "python3", eval_path,
    "--gold", gold,
    "--pred", pred,
    "--db", db_dir,
    "--table", table,
    "--etype", etype,
]
result = subprocess.run(cmd_args, capture_output=True, text=True)

# Surface failures here; otherwise a failed run only shows up as empty stdout.
if result.returncode != 0:
  print(f"evaluation.py exited with code {result.returncode}")
  print(result.stderr)

In [None]:
import pprint

# Show only the end of the captured evaluation output.
# NOTE(review): 4633 characters presumably covers the final summary table - confirm.
report_tail = result.stdout[-4633:]
pprint.pprint(report_tail)