# Fine tune with full scale dataset

## Import and utilities

In [1]:
import torch
import os
import sys
import wandb
import datasets
from transformers import TrainerCallback
import torch
from transformers import Trainer, TrainingArguments

In [2]:
datasets.enable_progress_bar()

In [3]:
current_dir = os.path.abspath(os.getcwd())

# Check if 'idl-project' is in the path
if 'idl-project' not in current_dir:
    raise Exception("Current directory '{current_dir}' is not within 'idl-project'")

print(f"✓ Working in '{current_dir}'")
print(f"✓ Directory contains 'idl-project'")

✓ Working in '/ocean/projects/cis250068p/iwiryadi/idl-project'
✓ Directory contains 'idl-project'


In [4]:
!git pull

Already up to date.


In [5]:
# Check CUDA
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla V100-SXM2-32GB


In [6]:
# Append paths for the src folder
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'idl-project')))

In [7]:
# Additional imports 
from src.model import load_fo_model
from src.data import load_flan_dataset
from src.utils import DEVICE, CACHE_DIR

Using device: cuda


In [8]:
OUTPUT_DIR = "output/"

if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

## Load model and dataset

In [9]:
model, tokenizer = load_fo_model()

In [None]:
# without custom pad token set to zero the convergence behaves weirdly
# tokenizer.pad_token    = "<|padding|>"
# tokenizer.pad_token_id = 1     

# Use EOS token as padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [23]:
# Define test prompts
test_prompts = [
    "Who is Barack Obama?",
    "What is Carnegie Mellon University?",
    "Classify this restaurant review sentiment: 'The food was absolutely delicious but the service was extremely slow and the waiter seemed uninterested in helping us.'",
    "Compare and contrast Carnegie Mellon University's Computer Science and Information Systems programs in terms of research focus and career outcomes.",
    "Summarize in one sentence: Dr. Sarah Chen, lead scientist on the mission, called it 'the most significant discovery in the history of space exploration.' The finding suggests that Mars once had a much more hospitable environment with liquid water and possibly a thicker atmosphere. The agency plans to send a sample return mission within the next five years to bring these fossils back to Earth for more detailed analysis. This discovery has profound implications for our understanding of how life might develop throughout the universe."
]


for test_input_string in test_prompts:
    inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)
    # print(tokens[0])
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id)
    print("Without sampling: " + tokenizer.decode(tokens[0]))
    
    print("---------------")
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id, do_sample=True)
    print("With sampling   : " + tokenizer.decode(tokens[0]))
    
    print("\n===============")

Without sampling: Who is Barack Obama?

The question is, is he a man who has been in office for a long time?

The answer is, yes.

He is a man who has been in office for a long time.

He is a man who has been in office for a long time.

He is a man who has been in office for a long time.

He is a man who has been in office for a long time.

He is a man who has been in office for a long time.

He is a man who has been in office for a long time.

He is a man who has been in office for a long time.

He is a
---------------
With sampling   : Who is Barack Obama?

What we can learn from that discussion is that Obama is a much more conservative politician than Barack. As he gets down on the floor (even with some of the best comments on the debates before he was in office) we will learn that Obama is as capable as anyone of holding up the House of Representatives and the Senate.

As for Obama’s role in the Republican Party and his lack of success of making his nomination appear a legitimate c

In [24]:
batch_size = 16
dataset = load_flan_dataset().batch(batch_size=batch_size)

Resolving data files:   0%|          | 0/2167 [00:00<?, ?it/s]

Dataset loaded successfully


In [48]:
print(tokenizer.pad_token)
print(tokenizer.pad_token_id)
print(tokenizer.eos_token)
print(tokenizer.eos_token_id)

<|endoftext|>
0
<|endoftext|>
0


In [26]:
# Configure tokenizer properly 

# - https://github.com/EleutherAI/pythia/issues/156 
#   mentioned it's okay to set it to eos

# - https://huggingface.co/EleutherAI/pythia-14m/discussions/4, 
#   we can see the tokenizer pad token from tokenizer.added_tokens_decoder 
tokenizer.added_tokens_decoder

{0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50257: AddedToken("                     ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50258: AddedToken("                    ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50259: AddedToken("                   ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50260: AddedToken("           

## Check

In [27]:
examples = next(iter(dataset))
print(examples.keys())

dict_keys(['inputs', 'targets', '_template_idx', '_task_source', '_task_name', '_template_type'])


In [28]:
combined_texts = []
for inp, tgt in zip(examples['inputs'], examples['targets']):
    # Handle potential None values or empty strings
    inp_text = inp if inp is not None else ""
    tgt_text = tgt if tgt is not None else ""
    combined_texts.append(f"{inp_text}\n======\n{tgt_text}\n{tokenizer.eos_token}")

print(combined_texts[1])

Options:
- yes
- it is not possible to tell
- no
The answer is it is not possible to tell.

Q: If "A man is sitting in a brown chair with a woman sitting on his lap smiling for the camera." does that mean that "A couple are taking an engagement picture."?
Options:
- yes
- it is not possible to tell
- no
A: Two people smiling for a camera do not mean that they are engagement pictures.
The answer is it is not possible to tell.

QUESTION: Premise: "Two men yellow are fishing from a small red box."
Based on this premise, can we conclude that the hypothesis "A father and son sit on a box fishing." is true?
Options:
- yes
- it is not possible to tell
- no

Let's solve it slowly: The two men do not have to be father and son.
The answer is it is not possible to tell.

[QUESTION] Given the sentence "White greyhound racing as dog number 1." can we conclude that "A dog racing."?
Options:
- yes
- it is not possible to tell
- no
A white greyhound is a dog; that it is racing is firmly established.
T

## Data preprocessing

In [29]:
def preprocess_forward(examples):
    # Combine inputs, outputs, and eos token for each example in the batch
    combined_texts = []
    
    for inp, tgt in zip(examples['inputs'], examples['targets']):
        
        # Handle potential None values or empty strings
        inp_text = inp if inp is not None else ""
        tgt_text = tgt if tgt is not None else ""
        
        # Use separator between input and target
        combined_texts.append(f"{inp_text}\n{tgt_text}{tokenizer.eos_token}")
    
    # Tokenize the entire batch at once
    tokenized = tokenizer(
        combined_texts,
        truncation=True,
        max_length=768,
        padding="longest",
        return_tensors="pt"
    )
    
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    labels = input_ids.clone()
    
    # Shift labels to the left (for each sequence in the batch)
    labels[:, :-1] = input_ids[:, 1:]
    # Add padding token as the last prediction target
    # labels[:, -1] = tokenizer.pad_token_id
    labels[:, -1] = -100  


    # Mask pad tokens in labels
    labels[input_ids == tokenizer.pad_token_id] = -100
    
    return {
        "input_ids": input_ids, 
        "attention_mask": attention_mask,
        "labels": labels
    }
    
# QUESTION: Do we need to mask / weight the prompt tokens? Discussion: https://towardsdatascience.com/to-mask-or-not-to-mask-the-effect-of-prompt-tokens-on-instruction-tuning-016f85fd67f4/
# Didn't find guidance through skimming the FLAN papers 

In [30]:
preprocess_forward(examples)

{'input_ids': tensor([[  510,   637,   320,  ...,     0,     0,     0],
         [   60, 26310,  2449,  ...,     0,     0,     0],
         [   43,   991,    27,  ...,     0,     0,     0],
         ...,
         [   50,    27, 11271,  ...,    15,   432,   253],
         [ 9301, 19782,    27,  ...,     0,     0,     0],
         [19751,   275,  1984,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[  637,   320,  4645,  ...,  -100,  -100,  -100],
         [26310,  2449,    62,  ...,  -100,  -100,  -100],
         [  991,    27,  6758,  ...,  -100,  -100,  -100],
         ...,
         [   27, 11271,   486,  ...,   432,   253,  -100],
         [19782,    27, 13343,  ...,  -100,  -100,  -100],
         [  275,  1984,    27,  ...,  -100,  -100,  -1

In [31]:
preprocessed = preprocess_forward(examples)
print("input_ids shape: " + str(preprocessed['input_ids'].shape))
print("attention_mask shape: " + str(preprocessed['attention_mask'].shape))
print("labels shape: " + str(preprocessed['labels'].shape))

preprocessed

input_ids shape: torch.Size([16, 768])
attention_mask shape: torch.Size([16, 768])
labels shape: torch.Size([16, 768])


{'input_ids': tensor([[  510,   637,   320,  ...,     0,     0,     0],
         [   60, 26310,  2449,  ...,     0,     0,     0],
         [   43,   991,    27,  ...,     0,     0,     0],
         ...,
         [   50,    27, 11271,  ...,    15,   432,   253],
         [ 9301, 19782,    27,  ...,     0,     0,     0],
         [19751,   275,  1984,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[  637,   320,  4645,  ...,  -100,  -100,  -100],
         [26310,  2449,    62,  ...,  -100,  -100,  -100],
         [  991,    27,  6758,  ...,  -100,  -100,  -100],
         ...,
         [   27, 11271,   486,  ...,   432,   253,  -100],
         [19782,    27, 13343,  ...,  -100,  -100,  -100],
         [  275,  1984,    27,  ...,  -100,  -100,  -1

In [32]:
def visualize_tokenization(tokenizer, input_ids, attention_mask, labels, n):
    """
    Visualize token-level information with their corresponding labels and masks.
    
    Args:
        tokenizer: The tokenizer used
        input_ids: The token IDs
        attention_mask: The attention mask
        labels: The labels for each token
    """
    # Take the first example if we have a batch
    if len(input_ids.shape) > 1:
        input_ids = input_ids[n]
        attention_mask = attention_mask[n]
        labels = labels[n]
    
    # Decode each token individually to see them separately
    tokens = [tokenizer.decode([id]) for id in input_ids]
    
    # Create a table for visualization
    print(f"{'Index':<6} | {'Token':<20} | {'ID':<8} | {'Mask':<6} | {'Label':<8}")
    print("-" * 60)
    
    for i, (token, id, mask, label) in enumerate(zip(tokens, input_ids, attention_mask, labels)):
        # Skip padding tokens if needed
        # if id == tokenizer.pad_token_id and mask == 0:
        #     continue
            
        # Format special tokens for better readability
        token_display = token.replace('\n', '\\n').replace('\t', '\\t')
        if len(token_display) > 18:
            token_display = token_display[:15] + "..."
            
        print(f"{i:<6} | {token_display:<20} | {int(id):<8} | {int(mask):<6} | {int(label) if label != -100 else 'ignored':<8}")


print(tokenizer.decode(preprocessed['input_ids'][2], skip_special_tokens=True))
visualize_tokenization(
    tokenizer, 
    preprocessed['input_ids'], 
    preprocessed['attention_mask'], 
    preprocessed['labels'], 
    n=2
)

Jax: Which of the following sentences is nonsensical?
Options:
- Sentence A: "He hasn't bathed in a week so he's dirty"
- Sentence B: "He hasn't bathed in a week so he's clean"

Alex: Chain of thought: A person who has not showered for a week can only be dirty. The answer is Sentence B.

Jax: Pick which sentence is not logical.
Options:
- Sentence A: "The sun goes around the earth"
- Sentence B: "The earth goes around the sun"

Alex: Chain of thought: The sun does not move, the earth rotates. The answer is Sentence A.

Jax: Of the below sentences, which one does *not* make sense?
Options:
- Sentence A: "He wore watch on his wrist"
- Sentence B: "he wore watch on his nose"

Alex:
Chain of thought: A watch is not worn on one's nose. The answer is Sentence B.
Index  | Token                | ID       | Mask   | Label   
------------------------------------------------------------
0      | J                    | 43       | 1      | 991     
1      | ax                   | 991      | 1      

## Training Setup

In [34]:
# Configuration
total_examples = 378_000_000  # Total examples in FLAN dataset

small_subset   = 378_000 * 2
# small_subset   = 3_780_000

train_ratio    = 0.9
val_ratio      = 0.05
test_ratio     = 0.05

In [35]:
# try out with smaller set first
total_examples = small_subset

In [37]:
# Shuffle and split before preprocessing
shuffled   = dataset.shuffle(seed=42, buffer_size=200_000)

# Calculate split sizes
train_size = int(total_examples * train_ratio)
val_size   = int(total_examples * val_ratio)
test_size  = int(total_examples * test_ratio)


In [38]:
print(train_size, val_size, test_size)

680400 37800 37800


In [39]:
# Create splits
train_raw = shuffled.take(train_size)
remaining = shuffled.skip(train_size)
val_raw   = remaining.take(val_size)
test_raw  = remaining.skip(val_size).take(test_size)

# Preprocess each split
tokenized_train = train_raw.map(preprocess_forward, batched=True)
tokenized_val   = val_raw.map(preprocess_forward, batched=True)
tokenized_test  = test_raw.map(preprocess_forward, batched=True)

# QUESTION: How do we know what is its task composition here?

In [40]:
# Training configuration
intended_epochs      = 1
gradient_accum_steps = 8

max_steps = int(total_examples * train_ratio) // (batch_size * gradient_accum_steps) * intended_epochs

print(max_steps)

5315


In [41]:
print(f"estimated time: {max_steps / 0.35}s ({max_steps / 0.35 / 3600}) hr")


estimated time: 15185.714285714286s (4.218253968253968) hr


In [42]:
wandb.init(entity="11785_finetuning", project='ivan-testing', name="testing")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33miwiryadi[0m ([33midl-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [43]:
class GenerationTestCallback(TrainerCallback):
    """
    Callback to generate text samples at evaluation steps.
    """
    def __init__(self, tokenizer, test_prompts, device="cuda"):
        """
        Initialize with tokenizer and test prompts.
        """
        self.tokenizer = tokenizer
        self.test_prompts = test_prompts
        self.device = device
        
    def on_evaluate(self, args, state, control, model, **kwargs):
        """
        Run after each evaluation.
        """
        model.eval()  # Set model to evaluation mode
        
        print("\n" + "="*50)
        print(f"Generating samples at step {state.global_step}:")
        print("="*50)
        
        with torch.no_grad():
            for prompt in self.test_prompts:
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                
                # Generate text
                output_ids = model.generate(
                    inputs["input_ids"],
                    max_length=150,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )
                
                # Decode the output
                generated_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
                
                # Print the result
                print(f"\nPrompt: {prompt}")
                print(f"Generated: {generated_text}")
                print("-"*50)
                
                # Log to W&B if you're using it
                if args.report_to == "wandb":
                    import wandb
                    wandb.log({
                        f"generation/{prompt}": wandb.Html(
                            f"<b>Step {state.global_step}</b><br>"
                            f"<p><b>Prompt:</b> {prompt}</p>"
                            f"<p><b>Generated:</b> {generated_text}</p>"
                        )
                    }, step=state.global_step)
        
        return control

Some additional notes / references ;
- From [2210.11416v5.pdf](papers/2210.11416v5.pdf): We found that learning rate, batch size and the dropout were the most important hyperparameters for instruction finetuning.
- https://www.philschmid.de/fine-tune-flan-t5
- https://arxiv.org/html/2401.13586v2 
- https://github.com/TheFloatingString/stacking-llms/blob/main/pipeline-fine_tune-instruction_prompt.ipynb and https://arxiv.org/html/2410.15570v1#S3 
- https://github.com/huggingface/smol-course/blob/main/1_instruction_tuning/notebooks/sft_finetuning_example.ipynb

In [45]:
training_args = TrainingArguments(
    run_name      = "testing",
    
    output_dir    = f"{OUTPUT_DIR}/pythia-finetuned",
    eval_strategy = "steps",
    learning_rate = 1e-6, # https://openreview.net/pdf?id=3pDMYjpOxk was using 1e-6 for HHLF Anthropic
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    max_steps     = max_steps,  # Required for streaming datasets
    
    
    eval_steps    = max_steps // 5, # eval in each 20%
    save_steps    = max_steps // 2, # eval in each 50%
    logging_steps = gradient_accum_steps,
    gradient_accumulation_steps = gradient_accum_steps,
    
    save_total_limit = 2,
    
    weight_decay     = 0.01,
    fp16             = True,
    max_grad_norm    = 1.0,
    warmup_steps     = max_steps // 5, # 20% warmup
    
    logging_dir      = "./logs",
    report_to        = "wandb",
    push_to_hub      = False,
    disable_tqdm     = False,
    
    dataloader_num_workers = 4,
)

# Create the callback
generation_callback = GenerationTestCallback(tokenizer, test_prompts, device=DEVICE)

# Initialize trainer with the callback
trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = tokenized_train,
    eval_dataset  = tokenized_val,
    tokenizer     = tokenizer,
    callbacks     = [generation_callback]  # Add our callback here
)

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [46]:
# Start training
os.environ['TOKENIZERS_PARALLELISM'] = "False"
trainer.train()

Step,Training Loss,Validation Loss


RuntimeError: DataLoader worker (pid 95850) is killed by signal: Killed. 

In [None]:
# Save final model
trainer.save_model(f"{OUTPUT_DIR}/pythia-finetuned-last-step-3")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/pythia-finetuned-last-step-3")

('output//pythia-finetuned-last-step-2/tokenizer_config.json',
 'output//pythia-finetuned-last-step-2/special_tokens_map.json',
 'output//pythia-finetuned-last-step-2/tokenizer.json')

## Test the resulting model

In [None]:
from transformers import GPTNeoXForCausalLM

model = GPTNeoXForCausalLM.from_pretrained(
    f"{OUTPUT_DIR}/pythia-finetuned-last-step-2",
    device_map=DEVICE
)

In [47]:


for test_input_string in test_prompts:
    inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)
    # print(tokens[0])
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id)
    print("Without sampling: " + tokenizer.decode(tokens[0]))
    
    print("---------------")
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id, do_sample=True)
    print("With sampling   : " + tokenizer.decode(tokens[0]))
    
    print("\n===============")

Without sampling: Who is Barack Obama? Is Barack Obama?n::::::::: the:::: the::::::: the:: the:: the::: the:: the::: the:: the:: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the: the
---------------
With sampling   : Who is Barack Obama? A?n?n?\x\xaq\xa/r-\\\\\\\xa0\0\a\+\\+\/\s/:\xn+\\+\\?\\\\xa-xaxa/\\xa0\xan\xan\6\2nn\nnn-\4\4n-\'\5b\2\0n\-\-i\iai\ian\\0-\\1pn\'?\-i\iti\ist\\0-\+\0\-0\\0+\\nonb\u;i\u\s a\

Without sampling: What is Carnegie Mellon University? Carnegie Mellon University is a university located in the University of Pennsylvania. in Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania Pennsylvania

1394 steps, rounghly ~190k data 


test_input_string = "Who is Barrack Obama?"
Before fine tuning: 'Who is Barrack Obama?\n\nThe Obama administration has been a major player in the Obama administration’s efforts to undermine the'
After fine tuning:  'Who is Barrack Obama? is a of the of the of the of the of the of the of the of the of the'
A                   'Who is Barrack Obama? is the is the is the? is the is the? is the is the is? the is'


test_input_string = "Carnegie Mellon University is known for"
Before fine tuning: 'Carnegie Mellon University is known for its research on the evolution of the human brain.\n\nThe Carnegie Mellon University’s research'
After fine tuning:  'Carnegie Mellon University is known for its educational, and, the,,,,,,,,,,,,,,'


test_input_string = "What is Carnegie Mellon University?"
Before fine tuning: "'What is Carnegie Mellon University?\n\nThe Carnegie Mellon University is a private, non-profit, non-profit, non'"
After fine tuning:  'What is Carnegie Mellon University? Carnegie Mellon University is a of of of of of of the of of of of of the of'
After               'What is Carnegie Mellon University? Carnegie Mellon University is a university located in the United States, the United States, and the of'
