# Fine tune with full scale dataset

## Import and utilities

In [32]:
import torch
import os
import sys
import wandb
import datasets

from transformers import Trainer, TrainingArguments

In [33]:
datasets.enable_progress_bar()

In [34]:
current_dir = os.path.abspath(os.getcwd())

# Check if 'idl-project' is in the path
if 'idl-project' not in current_dir:
    raise Exception("Current directory '{current_dir}' is not within 'idl-project'")

print(f"✓ Working in '{current_dir}'")
print(f"✓ Directory contains 'idl-project'")

✓ Working in '/ocean/projects/cis250068p/iwiryadi/idl-project'
✓ Directory contains 'idl-project'


In [35]:
!git pull

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Already up to date.


In [36]:
# Check CUDA
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla V100-SXM2-32GB


In [37]:
# Append paths for the src folder
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'idl-project')))

In [38]:
# Additional imports 
from src.model import load_fo_model
from src.data import load_flan_dataset
from src.utils import DEVICE, CACHE_DIR

In [39]:
OUTPUT_DIR = "output/"

if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

## Load model and dataset

In [40]:
model, tokenizer = load_fo_model()

In [41]:
test_input_string = "Carnegie Mellon University is known for"

inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)
tokens = model.generate(**inputs)
tokenizer.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Carnegie Mellon University is known for its research on the evolution of the human brain.\n\nThe Carnegie Mellon University’s research'

In [42]:
batch_size = 16
dataset = load_flan_dataset().batch(batch_size=batch_size)

Resolving data files:   0%|          | 0/2167 [00:00<?, ?it/s]

Dataset loaded successfully


In [43]:
print(tokenizer.pad_token)
print(tokenizer.pad_token_id)
print(tokenizer.eos_token)

None
None
<|endoftext|>


In [44]:
# Configure tokenizer properly 

# - https://github.com/EleutherAI/pythia/issues/156 
#   mentioned it's okay to set it to eos

# - https://huggingface.co/EleutherAI/pythia-14m/discussions/4, 
#   we can see the tokenizer pad token from tokenizer.added_tokens_decoder 
tokenizer.added_tokens_decoder

{0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50257: AddedToken("                     ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50258: AddedToken("                    ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50259: AddedToken("                   ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50260: AddedToken("           

In [45]:
# without custom pad token set to zero the convergence behaves weirdly
tokenizer.pad_token    = "<|padding|>"
tokenizer.pad_token_id = 1     

In [46]:
tokenizer.eos_token

'<|endoftext|>'

## Check

In [47]:
examples = next(iter(dataset))
print(examples.keys())

dict_keys(['inputs', 'targets', '_template_idx', '_task_source', '_task_name', '_template_type'])


In [48]:
combined_texts = []
for inp, tgt in zip(examples['inputs'], examples['targets']):
    # Handle potential None values or empty strings
    inp_text = inp if inp is not None else ""
    tgt_text = tgt if tgt is not None else ""
    combined_texts.append(f"{inp_text}\n======\n{tgt_text}\n{tokenizer.eos_token}")

print(combined_texts[1])

Options:
- yes
- it is not possible to tell
- no
The answer is it is not possible to tell.

Q: If "A man is sitting in a brown chair with a woman sitting on his lap smiling for the camera." does that mean that "A couple are taking an engagement picture."?
Options:
- yes
- it is not possible to tell
- no
A: Two people smiling for a camera do not mean that they are engagement pictures.
The answer is it is not possible to tell.

QUESTION: Premise: "Two men yellow are fishing from a small red box."
Based on this premise, can we conclude that the hypothesis "A father and son sit on a box fishing." is true?
Options:
- yes
- it is not possible to tell
- no

Let's solve it slowly: The two men do not have to be father and son.
The answer is it is not possible to tell.

[QUESTION] Given the sentence "White greyhound racing as dog number 1." can we conclude that "A dog racing."?
Options:
- yes
- it is not possible to tell
- no
A white greyhound is a dog; that it is racing is firmly established.
T

## Data preprocessing

In [49]:
def preprocess_forward(examples):
    # Combine inputs, outputs, and eos token for each example in the batch
    combined_texts = []
    
    for inp, tgt in zip(examples['inputs'], examples['targets']):
        
        # Handle potential None values or empty strings
        inp_text = inp if inp is not None else ""
        tgt_text = tgt if tgt is not None else ""
        
        # Use separator between input and target
        combined_texts.append(f"{inp_text}\n{tgt_text}{tokenizer.eos_token}")
    
    # Tokenize the entire batch at once
    tokenized = tokenizer(
        combined_texts,
        truncation=True,
        max_length=768,
        padding="longest",
        return_tensors="pt"
    )
    
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    labels = input_ids.clone()
    
    # Shift labels to the left (for each sequence in the batch)
    labels[:, :-1] = input_ids[:, 1:]
    # Add padding token as the last prediction target
    labels[:, -1] = tokenizer.pad_token_id
    
    # Mask pad tokens in labels
    labels[input_ids == tokenizer.pad_token_id] = -100
    
    return {
        "input_ids": input_ids, 
        "attention_mask": attention_mask,
        "labels": labels
    }
    
# QUESTION: Do we need to mask / weight the prompt tokens? Discussion: https://towardsdatascience.com/to-mask-or-not-to-mask-the-effect-of-prompt-tokens-on-instruction-tuning-016f85fd67f4/
# Didn't find guidance through skimming the FLAN papers 

In [50]:
preprocess_forward(examples)

{'input_ids': tensor([[  510,   637,   320,  ...,     1,     1,     1],
         [   60, 26310,  2449,  ...,     1,     1,     1],
         [   43,   991,    27,  ...,     1,     1,     1],
         ...,
         [   50,    27, 11271,  ...,    15,   432,   253],
         [ 9301, 19782,    27,  ...,     1,     1,     1],
         [19751,   275,  1984,  ...,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[  637,   320,  4645,  ...,  -100,  -100,  -100],
         [26310,  2449,    62,  ...,  -100,  -100,  -100],
         [  991,    27,  6758,  ...,  -100,  -100,  -100],
         ...,
         [   27, 11271,   486,  ...,   432,   253,     1],
         [19782,    27, 13343,  ...,  -100,  -100,  -100],
         [  275,  1984,    27,  ...,  -100,  -100,  -1

In [51]:
preprocessed = preprocess_forward(examples)
print("input_ids shape: " + str(preprocessed['input_ids'].shape))
print("attention_mask shape: " + str(preprocessed['attention_mask'].shape))
print("labels shape: " + str(preprocessed['labels'].shape))

preprocessed

input_ids shape: torch.Size([16, 768])
attention_mask shape: torch.Size([16, 768])
labels shape: torch.Size([16, 768])


{'input_ids': tensor([[  510,   637,   320,  ...,     1,     1,     1],
         [   60, 26310,  2449,  ...,     1,     1,     1],
         [   43,   991,    27,  ...,     1,     1,     1],
         ...,
         [   50,    27, 11271,  ...,    15,   432,   253],
         [ 9301, 19782,    27,  ...,     1,     1,     1],
         [19751,   275,  1984,  ...,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[  637,   320,  4645,  ...,  -100,  -100,  -100],
         [26310,  2449,    62,  ...,  -100,  -100,  -100],
         [  991,    27,  6758,  ...,  -100,  -100,  -100],
         ...,
         [   27, 11271,   486,  ...,   432,   253,     1],
         [19782,    27, 13343,  ...,  -100,  -100,  -100],
         [  275,  1984,    27,  ...,  -100,  -100,  -1

## Training Setup

In [52]:
# Configuration
total_examples = 378_000_000  # Total examples in FLAN dataset

small_subset   = 378_000
# small_subset   = 3_780_000

train_ratio    = 0.9
val_ratio      = 0.05
test_ratio     = 0.05

In [53]:
# try out with smaller set first
total_examples = small_subset

In [54]:
# Shuffle and split before preprocessing
shuffled   = dataset.shuffle(seed=42, buffer_size=100_000)

# Calculate split sizes
train_size = int(total_examples * train_ratio)
val_size   = int(total_examples * val_ratio)
test_size  = int(total_examples * test_ratio)


In [55]:
print(train_size, val_size, test_size)

340200 18900 18900


In [56]:
# Create splits
train_raw = shuffled.take(train_size)
remaining = shuffled.skip(train_size)
val_raw = remaining.take(val_size)
test_raw = remaining.skip(val_size).take(test_size)

# Preprocess each split
tokenized_train = train_raw.map(preprocess_forward, batched=True)
tokenized_val = val_raw.map(preprocess_forward, batched=True)
tokenized_test = test_raw.map(preprocess_forward, batched=True)

# QUESTION: How do we know what is its task composition here?

In [57]:
# Training configuration
intended_epochs = 2
gradient_accum_steps = 4

max_steps = int(total_examples * train_ratio) // (batch_size * gradient_accum_steps) * intended_epochs

print(max_steps)

10630


In [58]:
print(f"estimated time: {max_steps / 0.9}s ({max_steps / 0.9 / 3600}) hr")


estimated time: 11811.111111111111s (3.2808641975308643) hr


In [59]:
wandb.init(entity="11785_finetuning", project='ivan-testing', name="testing")

Notes;
- From [2210.11416v5.pdf](papers/2210.11416v5.pdf): We found that learning rate, batch size and the dropout were the most important hyperparameters for instruction finetuning.
- https://www.philschmid.de/fine-tune-flan-t5

In [60]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [None]:
training_args = TrainingArguments(
    run_name      = "testing",
    
    output_dir    = f"{OUTPUT_DIR}/pythia-finetuned",
    eval_strategy = "steps",
    learning_rate = 1e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    max_steps     = max_steps,  # Required for streaming datasets
    
    
    eval_steps    = max_steps // 5, # eval in each 20%
    save_steps    = max_steps // 2, # eval in each 50%
    logging_steps = gradient_accum_steps,
    gradient_accumulation_steps = gradient_accum_steps,
    
    save_total_limit = 2,
    
    weight_decay     = 0.01,
    fp16             = True,
    
    logging_dir      = "./logs",
    # report_to="none",
    report_to        = "wandb",
    push_to_hub      = False,
    disable_tqdm     = False,
    
    dataloader_num_workers = 4,
)

# Initialize trainer
trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = tokenized_train,
    eval_dataset  = tokenized_val,
    tokenizer     = tokenizer,
)

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [62]:
# Start training
trainer.train()

KeyboardInterrupt: 

In [None]:
# Save final model
trainer.save_model(f"{OUTPUT_DIR}/pythia-finetuned-last-step")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/pythia-finetuned-last-step")

## Test the resulting model

In [None]:
from transformers import GPTNeoXForCausalLM

model = GPTNeoXForCausalLM.from_pretrained(
    f"{OUTPUT_DIR}/pythia-finetuned-last-step",
    device_map=DEVICE
)

In [None]:
inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)
tokens = model.generate(**inputs)
tokenizer.decode(tokens[0])