In [1]:
import numpy as np
import pandas as pd
import torch

import nltk
import torch
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from transformers.optimization import Adafactor, AdafactorSchedule
from transformers import EarlyStoppingCallback
from peft import get_peft_model, PeftConfig, PeftModel, PromptEmbedding
from t5.dataset import load_spider_datasets, DatasetIterator
from t5.model import BaseModel, set_train_arguments
from t5.prompt_tuning import *

  from pandas.core.computation.check import NUMEXPR_INSTALLED
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
!huggingface-cli login --token

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


In [4]:
print(torch.cuda.is_available())

True


## Dataset

In [5]:
train_spider, others_spider, dev_spider = load_spider_datasets()
train_spider = train_spider.iloc[np.random.permutation(train_spider.index)].reset_index(drop=True)
others_spider = train_spider.iloc[np.random.permutation(others_spider.index)].reset_index(drop=True)

## Model Parameters

In [6]:
model_name = "t5-base"
technique = "soft_prompt_tune"
version = 3
checkpoint = 438

folder_name = f"{model_name}_{technique}_{version}"
train_path = f"results/{folder_name}"
model_path = train_path + f'/{folder_name}'
last_check_point = train_path + f'/checkpoint-{checkpoint}'

# model name on hugging face
hug_model_name = "RoxyRong/t5_base_soft_prompt_label_embedding"

print("train_path:", train_path)
print("model_path:", model_path)

train_path: results/t5-base_soft_prompt_tune_3
model_path: results/t5-base_soft_prompt_tune_3/t5-base_soft_prompt_tune_3


In [7]:
# soft prompt tuning param
num_tokens = 100

base_model_name = "RoxyRong/t5_base_finetuned"

tokenizer = T5Tokenizer.from_pretrained(model_name, model_max_length=512, legacy=False)

# init_text = init_random_vocab_from_tokenizer(tokenizer=tokenizer)
# init_embedding = init_label_embedding(train_spider, tokenizer)

peft_config = set_peft_config_with_random_init(num_tokens=num_tokens)
# peft_config = set_peft_config_with_init_text(init_text=init_text, num_tokens=num_tokens)
# peft_config = set_peft_config_with_embedding(init_embedding=init_embedding, num_tokens=num_tokens) 
# peft_config = PeftConfig.from_pretrained(last_check_point)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
model = get_peft_model(model, peft_config)

In [8]:
# print(model.print_trainable_parameters())
print(PromptEmbedding(peft_config, model.shared).embedding.weight)

Parameter containing:
tensor([[-1.1505,  1.1346,  0.7719,  ...,  1.2266, -0.7035, -0.1980],
        [-0.1068,  1.5342, -0.5477,  ..., -0.1487, -0.8666, -0.6342],
        [-0.2080, -1.7353, -0.3698,  ...,  1.4854, -0.4342,  0.8234],
        ...,
        [ 0.7859,  0.3043,  1.5054,  ..., -0.3675,  1.5332,  0.0957],
        [ 0.4412, -0.2100,  1.3893,  ..., -0.4516, -1.1279, -0.1571],
        [ 0.5033, -0.2446, -0.5827,  ..., -0.4625,  0.8803,  0.4667]],
       requires_grad=True)


## Model Training

In [9]:
max_load_at_once = 100

train_data_iterator = DatasetIterator(
    df=train_spider,
    tokenizer=tokenizer,
    max_load_at_once=max_load_at_once,
)

valid_data_iterator = DatasetIterator(
    df=others_spider,
    tokenizer=tokenizer,
    max_load_at_once=max_load_at_once,
)

In [10]:
batch_size = 16
num_epochs = 5
learning_rate=0.3

In [11]:
optimizer = Adafactor(
        model.parameters(),
        weight_decay=1e-5,
        decay_rate=-0.8,
        scale_parameter=False, 
        relative_step=True,
        warmup_init=True, 
    )
scheduler = AdafactorSchedule(optimizer)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

args = set_train_arguments(
    train_path=train_path,
    batch_size=batch_size,
    num_epochs=num_epochs,
    learning_rate=learning_rate,
    load_best_model_at_end=True
)

In [12]:
trainer = BaseModel(
    model=model,
    train_data_iterator = train_data_iterator,
    valid_data_iterator = valid_data_iterator,
    seq2seq_train_args = args,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping_callback]
)

In [73]:
# train from scratch
trainer.train()

# train from checkpoint
# trainer.train_from_checkpoint(last_check_point=last_check_point)

Epoch,Training Loss,Validation Loss
1,No log,0.00615


In [48]:
trainer.evaluate()

In [49]:
trainer.model_save(model_path=model_path)
trainer.model_upload(hug_model_name=hug_model_name)