<a href="https://colab.research.google.com/github/roxyrong/w266_project/blob/main/t5_soft_prompt_tuning_text_to_sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture 

!pip install transformers
!pip install sentencepiece
!pip install accelerate -U
!pip install datasets
!pip install nltk
!pip install peft
!pip install torch_optimizer

In [3]:
import sys
project_path = ''
sys.path.append(project_path)

In [4]:
import os
from typing import Dict, List
import subprocess
import collections
from collections import Counter
import json
import random
import numpy as np
import pandas as pd
import nltk
import torch
import torch.nn as nn
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
from transformers import GenerationConfig
import torch_optimizer as optim
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, PeftConfig, TaskType, PeftModel, PeftType, AutoPeftModelForSeq2SeqLM

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [5]:
!huggingface-cli login --token

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


In [6]:
# for evaluation
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# datasets
with open('spider/train_spider.json', 'r') as f:
    train_spider = pd.read_json(f)
with open('spider/train_others.json', 'r') as f:
    others_spider = pd.read_json(f)
with open('spider/dev.json', 'r') as f:
    dev_spider = pd.read_json(f)

In [8]:
# load schema for all tables
with open('spider/tables.json', 'r') as f:
    schema_df = pd.read_json(f)

In [9]:
def _get_schema_string(table_json):
    """Returns the schema serialized as a string."""
    table_id_to_column_names = collections.defaultdict(list)
    for table_id, name in table_json["column_names_original"]:
        table_id_to_column_names[table_id].append(name.lower())
        tables = table_json["table_names_original"]

    table_strings = []
    for table_id, table_name in enumerate(tables):
        column_names = table_id_to_column_names[table_id]
        table_string = " | %s : %s" % (table_name.lower(), " , ".join(column_names))
        table_strings.append(table_string)

    return "".join(table_strings)

schema_dict = {}
for idx, row in schema_df.iterrows():
    db_id = row['db_id']
    schema = _get_schema_string(row)
    schema_dict[db_id] = schema

In [10]:
# shuffle the dataset

train_spider = train_spider.iloc[np.random.permutation(train_spider.index)].reset_index(drop=True)
others_spider = train_spider.iloc[np.random.permutation(others_spider.index)].reset_index(drop=True)

# Parameters

In [11]:
%%capture

# tokenizer = T5Tokenizer.from_pretrained("t5-base", max_model_length=128)
# model = AutoModelForSeq2SeqLM.from_pretrained("RoxyRong/t5_base_finetuned")
# size = 32128

peft_model_id = "RoxyRong/t5_base_soft_prompt_2"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained("RoxyRong/t5_base_finetuned")
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = T5Tokenizer.from_pretrained("t5-base")
size = 32128

In [12]:
corpus = list(train_spider['question'])

token_counter = Counter()

for text in corpus:
    tokens = tokenizer.tokenize(text)
    token_counter.update(tokens)

most_common_tokens = token_counter.most_common(500)

In [13]:
most_common_tokens = " ".join([token for token, freq in most_common_tokens])[:512]

In [14]:
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=100,
    inference_mode=False,
    prompt_tuning_init_text=most_common_tokens,
    tokenizer_name_or_path="t5-base",
)

In [15]:
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())

NameError: name 'peft_config' is not defined

In [16]:
model_name = "t5-base"
technique = "soft-prompt-tuned"
version = 8

folder_name = f"{model_name}_{technique}_{version}"
train_path = f"results/{folder_name}"
model_path = train_path + f'/{folder_name}'
last_check_point = train_path + f'/checkpoint-1000'

print('train_path:', train_path)
print('model_path:', model_path)

train_path: results/t5-base_soft-prompt-tuned_8
model_path: results/t5-base_soft-prompt-tuned_8/t5-base_soft-prompt-tuned_8


# Model Architecture

In [17]:
# construct prompt

prefix = 'translate English to SQL:'

train_spider['schema'] = train_spider['db_id'].map(schema_dict)
train_spider['prompt'] = prefix + train_spider['question'] + '\nDatabse schema is ' + train_spider['schema']
others_spider['schema'] = others_spider['db_id'].map(schema_dict)
others_spider['prompt'] = prefix + others_spider['question'] + '\nDatabse schema is ' + others_spider['schema']
dev_spider['schema'] = dev_spider['db_id'].map(schema_dict)
dev_spider['prompt'] = prefix + dev_spider['question'] + '\nDatabse schema is ' + dev_spider['schema']

In [18]:
def preprocess_data(text_pair, tokenizer, max_length=128):
    orig_text, target_text = text_pair
    orig_encoded = tokenizer.batch_encode_plus(
        [orig_text],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    orig_input_ids = orig_encoded['input_ids'][0]
    orig_attention_mask = orig_encoded['attention_mask'][0]

#     orig_input_ids = torch.cat([torch.full((1,100), size),
#                                 orig_encoded['input_ids']], 1)[0]

#     orig_attention_mask = torch.cat([torch.full((1,100), 1),
#                                      orig_encoded['attention_mask']], 1)[0]

    target_encoded = tokenizer.batch_encode_plus(
        [target_text],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    label_ids = target_encoded['input_ids'][0]

    return {'input_ids': orig_input_ids,
            'attention_mask': orig_attention_mask,
            'labels': label_ids}

In [19]:
class TranslationDataIterator:

    def __init__(self,
                 df,
                 tokenizer,
                 max_load_at_once,
                 max_length=128,
                 shuffle=True):

        self.df = df
        self.tokenizer = tokenizer
        self.n_examples = len(df)
        self.max_load_at_once = max_load_at_once
        self.max_length = max_length
        self.shuffle = shuffle

        # Initialize row order, call on_epoch_end to shuffle row indices
        self.row_order = np.arange(1, self.n_examples+1)
        self.on_epoch_end()

        # Load first chunk of max_load_at_once examples
        self.df_curr_loaded = self._load_next_chunk(0)
        self.curr_idx_in_load = 0

    def _load_next_chunk(self, idx):
        load_start = idx
        load_end = idx + self.max_load_at_once

        # Indices to skip are the ones in the shuffled row_order before and
        # after the chunk we'll use for this chunk
        self.df_curr_loaded = self.df.iloc[load_start:load_end].sample(frac=1)

    def __len__(self):
        return self.n_examples

    def __getitem__(self, idx):
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._load_next_chunk(idx)
            self.curr_idx_in_load = 0

        text_pair = self.df_curr_loaded[['prompt', 'query']].values.astype(str)[self.curr_idx_in_load]
        self.curr_idx_in_load += 1

        item_data = preprocess_data(
            text_pair,
            self.tokenizer,
            self.max_length
        )

        return item_data

    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__()-1:
                self.on_epoch_end()

    def on_epoch_end(self):
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

# Model Training

In [20]:
max_length = 128
max_load_at_once = 100

train_data_iterator = TranslationDataIterator(
    df=train_spider,
    tokenizer=tokenizer,
    max_load_at_once=max_load_at_once,
    max_length=max_length
)

valid_data_iterator = TranslationDataIterator(
    df=others_spider,
    tokenizer=tokenizer,
    max_load_at_once=max_load_at_once,
    max_length=max_length
)

In [None]:
def adafactor_optimizer(model: torch.nn.Module):
    return optim.Adafactor(
        model.parameters(),
        lr=None,
        clip_threshold=1.0,
        decay_rate=-0.8,
        beta1=None,
        weight_decay=0.0,
        relative_step=False,
        scale_parameter=False,
        warmup_init=False
    )

In [29]:
batch_size = 32

args = Seq2SeqTrainingArguments(
    train_path,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    learning_rate=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_data_iterator,
    eval_dataset=valid_data_iterator,
)

In [31]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [32]:
# start from scratch
# trainer.train()

# start from a checkpoint
trainer.train(resume_from_checkpoint= last_check_point)

Loading model from results/t5-base_soft-prompt-tuned_8/checkpoint-1000.
***** Running training *****
  Num examples = 7000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2190
  Number of trainable parameters = 153600
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 4
  Continuing training from global step 1000
  Will skip the first 4 epochs then the first 124 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/124 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss
5,0.1087,0.072612
6,0.1087,0.072592
7,0.1079,0.072581
8,0.1079,0.072536
9,0.1079,0.072524
10,0.1089,0.07251


***** Running Evaluation *****
  Num examples = 1659
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1659
  Batch size = 32
Saving model checkpoint to results/t5-base_soft-prompt-tuned_8/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1659
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1659
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1659
  Batch size = 32
Saving model checkpoint to results/t5-base_soft-prompt-tuned_8/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1659
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2190, training_loss=0.05894580126897385, metrics={'train_runtime': 1002.7204, 'train_samples_per_second': 69.81, 'train_steps_per_second': 2.184, 'total_flos': 1.06567630848e+16, 'train_loss': 0.05894580126897385, 'epoch': 10.0})

In [35]:
trainer.save_model(model_path)

Saving model checkpoint to results/t5-base_soft-prompt-tuned_8/t5-base_soft-prompt-tuned_8
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


In [36]:
model.push_to_hub("RoxyRong/t5_base_soft_prompt_2_10epochs", use_auth_token=True)

Uploading the following files to RoxyRong/t5_base_soft_prompt_2_10epochs: README.md,adapter_model.bin,adapter_config.json


CommitInfo(commit_url='https://huggingface.co/RoxyRong/t5_base_soft_prompt_2_10epochs/commit/842efae816b9f0cdb5a84199de96f4bcfd9e8333', commit_message='Upload model', commit_description='', oid='842efae816b9f0cdb5a84199de96f4bcfd9e8333', pr_url=None, pr_revision=None, pr_num=None)

# Evaluation

In [25]:
# finetune_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=128)
peft_model_id = "RoxyRong/t5_base_soft_prompt_test"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained("RoxyRong/t5_base_finetuned")
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.to("cuda")

loading file spiece.model from cache at /home/ubuntu/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 1

Downloading (…)/adapter_config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--RoxyRong--t5_base_finetuned/snapshots/a8189fb6c976fccedff8421293a0390ea944a89e/config.json
Model config T5Config {
  "_name_or_path": "RoxyRong/t5_base_finetuned",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_le

Downloading adapter_model.bin:   0%|          | 0.00/615k [00:00<?, ?B/s]

In [29]:
# evaluate
max_length = 128

inputs = tokenizer.batch_encode_plus(
        list(dev_spider['prompt']),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

output_tokens = finetune_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=128
)

outputs = [tokenizer.decode(i, skip_special_tokens=True) for i in output_tokens]

KeyboardInterrupt: 

In [None]:
with open(f'{folder_name}/predicted_result.txt', 'w') as f:
    for idx, output in enumerate(outputs):
        db_id = dev_spider.iloc[idx]['db_id']
        f.write(output + '\t' + db_id + '\n')

In [None]:
# evaluate results
eval_path = f"third_party/spider/evaluation.py"
gold = f"third_party/spider/evaluation_examples/gold_example.txt"
pred = f"{folder_name}/predicted_result.txt"
db_dir = f"spider/database"
table = f"spider/tables.json"
etype = "all"

cmd_str = f"python3 \"{eval_path}\" --gold \"{gold}\" --pred \"{pred}\" --db \"{db_dir}\" --table \"{table}\" --etype {etype} "
result = subprocess.run(cmd_str, shell=True, capture_output=True, text=True)

In [None]:
import pprint
pprint.pprint(result.stdout[-4633:])