In [1]:
import numpy as np
import pandas as pd
import gc
import nltk
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from gpt_j.dataset import load_spider_datasets, DatasetIterator
from gpt_j.gpt_j import GPTJForCausalLM
from t5.model import BaseModel, set_train_arguments
# Make sure the NLTK 'punkt' resource is present locally (downloads it if missing).
nltk.download('punkt')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
!huggingface-cli login --token 

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


In [3]:
# Report whether PyTorch can see a CUDA device; the model is moved to "cuda" further down.
print(torch.cuda.is_available())

True


## Model Parameters

In [4]:
# Identity of this run: base model, training technique, run version and the
# checkpoint step that a resumed run would start from.
model_name, technique = "gpt-j-6B-8bit", "fine_tuned"
version, checkpoint = 1, 2190

# All artefacts of the run live under results/<model>_<technique>_<version>.
folder_name = "_".join((model_name, technique, str(version)))
train_path = f"results/{folder_name}"
model_path = f"{train_path}/{folder_name}"
last_check_point = f"{train_path}/checkpoint-{checkpoint}"

# Repository name used when the model is uploaded.
hug_model_name = "RoxyRong/gptj_base_finetuned"

print("train_path:", train_path)
print("model_path:", model_path)

train_path: results/gpt-j-6B-8bit_fine_tuned_1
model_path: results/gpt-j-6B-8bit_fine_tuned_1/gpt-j-6B-8bit_fine_tuned_1


In [5]:
# Tokenizer: left-side padding, with the EOS token registered as the pad token.
tokenizer = AutoTokenizer.from_pretrained(
    "EleutherAI/gpt-j-6B",
    padding_side="left",
)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# Load the 8-bit GPT-J weights and place the model on the GPU.
model = GPTJForCausalLM.from_pretrained(
    "hivemind/gpt-j-6B-8bit",
    low_cpu_mem_usage=True,
).to("cuda")

## Dataset

In [6]:
# Load the three Spider frames, then shuffle the rows of the training and
# "others" frames (the latter is used as the validation set below).
train_spider, others_spider, dev_spider = load_spider_datasets()

# Each frame is permuted by *position* (0..len-1), which is what `.iloc`
# expects, and strictly from its own rows: the validation frame must never be
# sampled out of `train_spider`, otherwise validation examples are training
# examples (leakage) and an out-of-range position can raise IndexError.
train_spider = train_spider.iloc[np.random.permutation(len(train_spider))].reset_index(drop=True)
others_spider = others_spider.iloc[np.random.permutation(len(others_spider))].reset_index(drop=True)

In [7]:
# Tokenize the first training example: the prompt is padded/truncated to 1024
# tokens and returned together with its attention mask.
orig_encoded = tokenizer.batch_encode_plus(
    [train_spider.iloc[0]['prompt']],
    max_length=1024,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)
orig_input_ids, orig_attention_mask = (
    orig_encoded['input_ids'][0],
    orig_encoded['attention_mask'][0],
)

# The target query is tokenized without padding and without an attention mask.
target_encoded = tokenizer.batch_encode_plus(
    [train_spider.iloc[0]['query']],
    truncation=True,
    return_attention_mask=False,
    return_tensors='pt',
)
label_ids = target_encoded['input_ids'][0]

In [8]:
# Value forwarded to both iterators as `max_load_at_once`.
max_load_at_once = 10

# The training and validation iterators share every setting except the frame
# they read from, so both are built from the same constructor call.
train_data_iterator, valid_data_iterator = (
    DatasetIterator(
        df=frame,
        tokenizer=tokenizer,
        max_load_at_once=max_load_at_once,
        input_max_length=128,
        output_max_length=128,
    )
    for frame in (train_spider, others_spider)
)

In [None]:
# Run a full garbage-collection pass before the training section starts.
gc.collect()

## Model Training

In [9]:
# Training hyperparameters passed to set_train_arguments.
batch_size, num_epochs = 8, 1
learning_rate = 1e-4

In [10]:
# Bundle the output directory and the hyperparameters into the training arguments.
train_settings = {
    "train_path": train_path,
    "batch_size": batch_size,
    "num_epochs": num_epochs,
    "learning_rate": learning_rate,
}
args = set_train_arguments(**train_settings)

In [11]:
# Wire the model, both dataset iterators and the training arguments into the trainer.
trainer = BaseModel(
    model=model,
    train_data_iterator=train_data_iterator,
    valid_data_iterator=valid_data_iterator,
    seq2seq_train_args=args,
)

In [None]:
# Start a new training run (no checkpoint is resumed).
trainer.train()

# Alternative: resume from the checkpoint directory held in `last_check_point`.
# trainer.train_from_checkpoint(last_check_point=last_check_point)



Epoch,Training Loss,Validation Loss


In [None]:
# Run the trainer's evaluation step; what it reports is defined by BaseModel.evaluate (not shown here).
trainer.evaluate()

In [None]:
# Optional (currently disabled): save the model locally under `model_path`.
# trainer.model_save(model_path=model_path)
# Upload the model under the name in `hug_model_name`
# (presumably to the Hugging Face Hub — confirm in BaseModel.model_upload).
trainer.model_upload(hug_model_name=hug_model_name)