In [1]:
import numpy as np
import pandas as pd

import nltk
import torch
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from transformers.optimization import Adafactor, AdafactorSchedule
from transformers import EarlyStoppingCallback
from peft import get_peft_model, PeftConfig, PeftModel
from t5.dataset import load_spider_datasets, DatasetIterator
from t5.model import BaseModel, set_train_arguments
from t5.prompt_tuning import *

  from pandas.core.computation.check import NUMEXPR_INSTALLED
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Fetch the NLTK "punkt" resource into the local nltk_data directory.
# The call's return value is displayed as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
!huggingface-cli login --token 

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


In [4]:
# Sanity check: print whether PyTorch reports an available CUDA device.
# The model is moved to "cuda" unconditionally later in this notebook.
print(torch.cuda.is_available())

True


## Model Parameters

In [5]:
# --- Run identity ------------------------------------------------------------
model_name = "t5-base"
technique = "soft_prompt_tune"
version = 1

# --- Checkpoint settings -----------------------------------------------------
train_from_checkpoint = False
checkpoint = 2190  # checkpoint number used to build `last_check_point`

# --- Output locations, all derived from the run identity ---------------------
folder_name = "_".join((model_name, technique, str(version)))
train_path = "results/" + folder_name
model_path = f"{train_path}/{folder_name}"
last_check_point = f"{train_path}/checkpoint-{checkpoint}"

# Model name on Hugging Face.
hug_model_name = "RoxyRong/t5_base_soft_prompt_tune_v3"

print(f"train_path: {train_path}")
print(f"model_path: {model_path}")

train_path: results/t5-base_soft_prompt_tune_1
model_path: results/t5-base_soft_prompt_tune_1/t5-base_soft_prompt_tune_1


In [6]:
# Soft prompt tuning parameters.
base_model_name = "RoxyRong/t5_base_finetuned"  # model the PEFT wrapper is applied to
num_tokens = 100  # forwarded to set_peft_config_with_init_text
size = 32128  # NOTE(review): not referenced in the visible cells — confirm it is still needed

# The tokenizer comes from `model_name`, while the weights come from
# `base_model_name`.
tokenizer = T5Tokenizer.from_pretrained(model_name)
init_text = init_random_vocab_from_tokenizer(tokenizer=tokenizer)
peft_config = set_peft_config_with_init_text(
    init_text=init_text,
    num_tokens=num_tokens,
)

# Load the seq2seq model, wrap it with the PEFT config, and move it to the GPU.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(base_model_name),
    peft_config,
).to("cuda")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 

## Dataset

In [7]:
# Load the three Spider frames, then shuffle the rows of `train_spider` and
# `others_spider` independently (`others_spider` feeds the validation iterator).
train_spider, others_spider, dev_spider = load_spider_datasets()

# Permute row positions (0..len-1) rather than index labels, so `.iloc` stays
# valid even when a frame does not carry a default 0..n-1 index.
train_spider = train_spider.iloc[np.random.permutation(len(train_spider))].reset_index(drop=True)

# Fix: shuffle `others_spider` itself. The previous code indexed `train_spider`
# with a permutation of `others_spider`'s index, which replaced the validation
# frame with training rows.
others_spider = others_spider.iloc[np.random.permutation(len(others_spider))].reset_index(drop=True)

In [8]:
# Forwarded to both iterators as `max_load_at_once`
# (presumably the number of rows loaded per chunk — confirm in t5.dataset).
max_load_at_once = 100

# Both iterators share the tokenizer and load size; only the source frame differs.
iterator_kwargs = {"tokenizer": tokenizer, "max_load_at_once": max_load_at_once}

train_data_iterator = DatasetIterator(df=train_spider, **iterator_kwargs)
valid_data_iterator = DatasetIterator(df=others_spider, **iterator_kwargs)

## Model Training

In [9]:
# Training hyperparameters; all three are forwarded to set_train_arguments.
batch_size, num_epochs = 16, 50
learning_rate = 0.3

In [10]:
# Adafactor over all model parameters. No explicit `lr` is passed and
# relative_step/warmup_init are enabled, so the step size is left to the
# optimizer and AdafactorSchedule.
# NOTE(review): `learning_rate` is only forwarded to set_train_arguments below;
# it presumably never reaches this optimizer — confirm this is intended.
adafactor_kwargs = dict(
    weight_decay=1e-5,
    decay_rate=-0.8,
    scale_parameter=False,
    relative_step=True,
    warmup_init=True,
)
optimizer = Adafactor(model.parameters(), **adafactor_kwargs)
scheduler = AdafactorSchedule(optimizer)

# Early stopping callback configured with a patience of 2.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

# Training arguments built by the project helper from the values set earlier.
args = set_train_arguments(
    load_best_model_at_end=True,
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    batch_size=batch_size,
    train_path=train_path,
)

In [11]:
# Assemble the project's trainer wrapper from the model, data iterators,
# training arguments, optimizer/scheduler pair, and early-stopping callback.
trainer = BaseModel(
    model=model,
    seq2seq_train_args=args,
    train_data_iterator=train_data_iterator,
    valid_data_iterator=valid_data_iterator,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping_callback],
    hug_model_name=hug_model_name,
)

In [None]:
# Choose the training mode from the `train_from_checkpoint` flag set in the
# "Model Parameters" cell instead of commenting code in and out.
if train_from_checkpoint:
    # Resume from the checkpoint directory derived from `checkpoint`.
    train_output = trainer.train_from_checkpoint(last_check_point=last_check_point)
else:
    # Train from scratch.
    train_output = trainer.train()

# Keep the training result as the cell's displayed output.
train_output

In [None]:
# Run the trainer's evaluation; the returned value is shown as the cell output.
trainer.evaluate()

In [None]:
# Save the model under `model_path`, then upload it through the trainer.
# NOTE(review): the upload presumably targets the `hug_model_name` repo that was
# passed to BaseModel — confirm in t5.model.
trainer.model_save(model_path=model_path)
trainer.model_upload()