# Fine-tune with the full-scale dataset

## Imports and utilities

In [None]:
import os
import sys

import datasets
import torch
import wandb
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments

In [None]:
# Turn on the `datasets` library's progress bars for the rest of the notebook.
datasets.enable_progress_bar()

In [25]:
# Absolute path of the directory the notebook kernel was started in.
current_dir = os.path.abspath(os.getcwd())

# Fail fast when the notebook runs outside the project checkout: later cells
# rely on paths (src/, output/) that are resolved relative to this directory.
if 'idl-project' not in current_dir:
    # The message must be an f-string, otherwise the literal text
    # "{current_dir}" is shown instead of the offending path.
    raise Exception(f"Current directory '{current_dir}' is not within 'idl-project'")

print(f"✓ Working in '{current_dir}'")
print("✓ Directory contains 'idl-project'")

✓ Working in '/ocean/projects/cis250068p/iwiryadi/idl-project'
✓ Directory contains 'idl-project'


In [26]:
# IPython shell escape: update the checkout from its remote before running.
!git pull

Already up to date.


In [27]:
# Check CUDA
# Query availability once; `torch.cuda.get_device_name` raises when no CUDA
# device is present, so it is only called after the availability check passes.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; no GPU device name to report")

True
Tesla V100-SXM2-32GB


In [28]:
# Append paths for the src folder
# The parent of the working directory is added to the import search path.
# The membership test keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
project_parent = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_parent not in sys.path:
    sys.path.append(project_parent)
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'idl-project')))

In [29]:
# Additional imports 

from src.model import load_fo_model
from src.data import load_flan_dataset

In [30]:
# Directory (relative to the working directory) that receives checkpoints and
# the final saved model.
OUTPUT_DIR = "output/"

# `makedirs(..., exist_ok=True)` replaces the exists-then-mkdir pattern: it is
# safe to re-run, has no check/create race, and also creates missing parents.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Load model and dataset

In [31]:
# Load the model and its tokenizer through the project helper from src.model.
model, tokenizer = load_fo_model()

In [32]:
# Number of examples per batch; also reused later as the Trainer's
# per-device train/eval batch size.
batch_size = 16
# Load the FLAN dataset through the project helper and group it into batches,
# so every element yielded by `dataset` holds `batch_size` examples.
# NOTE(review): later cells size the splits in examples (take/skip) and call
# `.map(..., batched=True)` on this already-batched dataset — confirm that
# batching at this point is intended.
dataset = load_flan_dataset().batch(batch_size=batch_size)

Resolving data files:   0%|          | 0/2167 [00:00<?, ?it/s]

Dataset loaded successfully


In [33]:
# Show the tokenizer's current pad token, pad token id and EOS token,
# one value per line.
print(
    tokenizer.pad_token,
    tokenizer.pad_token_id,
    tokenizer.eos_token,
    sep="\n",
)

None
None
<|endoftext|>


In [34]:
# Configure the tokenizer's padding token.
#
# Background for the choice made in the next cell:
# - https://github.com/EleutherAI/pythia/issues/156
#   mentions that it is okay to set the pad token to EOS.
# - https://huggingface.co/EleutherAI/pythia-14m/discussions/4
#   points out that the tokenizer's dedicated padding token can be found in
#   `tokenizer.added_tokens_decoder`, which is displayed here.
tokenizer.added_tokens_decoder

{0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50257: AddedToken("                     ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50258: AddedToken("                    ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50259: AddedToken("                   ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 50260: AddedToken("           

In [35]:
# Use the dedicated "<|padding|>" token (id 1 in `added_tokens_decoder` above)
# as the pad token rather than reusing EOS.
# Author's note: without a custom pad token the convergence behaves weirdly.
# NOTE(review): the original note said the pad token is "set to zero", but the
# id assigned here is 1 — confirm which one is intended.
tokenizer.pad_token    = "<|padding|>"
tokenizer.pad_token_id = 1

In [36]:
# Display the EOS token to confirm it is still set after configuring padding.
tokenizer.eos_token

'<|endoftext|>'

In [37]:
def preprocess_forward(example, max_length=768):
    """Tokenize one example into input ids and pre-shifted labels.

    The example is rendered as ``"{inputs}\\n{targets}{eos}"``, tokenized, and
    padded/truncated to ``max_length`` tokens. ``labels[i]`` holds the token
    that follows ``input_ids[i]``; positions that must not contribute to the
    loss are set to -100.

    Note: a later cell defines a batched function with this same name, which
    replaces this single-example version once that cell runs.

    Args:
        example: Mapping with ``'inputs'`` and ``'targets'`` entries. ``None``
            values are treated as empty strings, matching the batched version.
        max_length: Fixed sequence length to pad/truncate to.

    Returns:
        Dict with 1-D ``input_ids`` and ``labels`` tensors.
    """
    prompt = "" if example['inputs'] is None else example['inputs']
    answer = "" if example['targets'] is None else example['targets']
    # combine input, output, and eos token - (I:... Q:... A:... <eos>)
    combined_text = f"{prompt}\n{answer}{tokenizer.eos_token}"

    tokenized = tokenizer(
        combined_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # The tokenizer returns a batch of one; take its only row.
    input_ids = tokenized["input_ids"][0]
    labels = input_ids.clone()

    # shift left: the target at position i is the token at position i + 1
    labels[:-1] = input_ids[1:]
    # The final position has no next token. Previously it kept its own input
    # token as the target, which is a wrong label whenever the text is
    # truncated (the last input is then a real token, not padding).
    labels[-1] = -100

    # mask the positions whose *input* is a pad or eos token
    labels[input_ids == tokenizer.pad_token_id] = -100
    labels[input_ids == tokenizer.eos_token_id] = -100
    #TODO: Do we need to mask the prompt tokens?
    # NOTE(review): stock Hugging Face causal-LM heads shift labels internally;
    # if the model from load_fo_model does too, pre-shifting here would shift
    # twice — confirm against the model's loss computation.

    return {"input_ids": input_ids, "labels": labels}

## Data preprocessing

In [38]:
def preprocess_forward(examples, max_length=768):
    """Tokenize a batch of examples into inputs and pre-shifted labels.

    Every example is rendered as ``"{inputs}\\n{targets}{eos}"``, the batch is
    tokenized in one call, and each row is padded/truncated to ``max_length``
    tokens. ``labels[b, i]`` holds the token that follows ``input_ids[b, i]``;
    positions that must not contribute to the loss are set to -100.

    Args:
        examples: Mapping with parallel ``'inputs'`` and ``'targets'``
            sequences. ``None`` entries are treated as empty strings.
        max_length: Fixed sequence length to pad/truncate every row to.

    Returns:
        Dict with 2-D ``input_ids``, ``attention_mask`` and ``labels`` tensors
        (one row per example).
    """
    # Combine inputs, outputs, and eos token for each example in the batch
    combined_texts = [
        f"{'' if inp is None else inp}\n{'' if tgt is None else tgt}{tokenizer.eos_token}"
        for inp, tgt in zip(examples['inputs'], examples['targets'])
    ]

    # Tokenize the entire batch at once
    tokenized = tokenizer(
        combined_texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    labels = input_ids.clone()

    # Shift labels to the left (for each sequence in the batch)
    labels[:, :-1] = input_ids[:, 1:]
    # The final position has no next token, so it is ignored by the loss.
    # It used to be set to the pad id, which survived the masking below on
    # truncated rows (last input is a real token) and taught the model to
    # emit padding there.
    labels[:, -1] = -100

    # Mask the positions whose *input* is a pad or eos token
    labels[input_ids == tokenizer.pad_token_id] = -100
    labels[input_ids == tokenizer.eos_token_id] = -100

    # TODO: consider masking the prompt tokens as well, e.g. by locating where
    # the targets start and masking everything before that point.
    # NOTE(review): stock Hugging Face causal-LM heads shift labels internally;
    # if the model from load_fo_model does too, pre-shifting here would shift
    # twice — confirm against the model's loss computation.

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

## Check

In [39]:
# Pull the first element from the dataset and rebuild the prompt strings by
# hand, the same way preprocess_forward does, for visual inspection.
examples = next(iter(dataset))
combined_texts = [
    # None values are rendered as empty strings.
    f"{'' if inp is None else inp}\n{'' if tgt is None else tgt}{tokenizer.eos_token}"
    for inp, tgt in zip(examples['inputs'], examples['targets'])
]

In [40]:
# Number of rebuilt prompts, followed by one sample (index 5) in full.
print(len(combined_texts))
print(combined_texts[5])

16
Explanation and answer: Everyone knows that Jews died in the Holocaust.
yes

The corresponding question: Does the following sentence make sense?
"Ashkenazi Jews died in the Holocaust."
Options:
- yes
- no
This is a simple but accurate summary of some aspects of The Wire.
yes
Is the following statement true?
"The Wire deals with law enforcement and how it affects and influences things such as ports, education, and the illegal drug trade."
Explanation and answer: The actor plays the main role in the show.
yes

The corresponding question: "David James Elliott stars in the show JAG."
Is the above claim true?
Options:
- yes
- no
Aamir khan is a indian actor. He is from bollywood.
no
Is the following statement true?
"Aamir Khan acted in many holly wood movies."
Explanation and answer: Drum is actually a musical instrument and not the thin flap of skin that is stretched tight inside the ear.
no

The corresponding question: Claim: "Drum vibrates inside the ear when sound hits it."
Is the cl

## Training Setup

In [41]:
# Configuration
total_examples = 378_000_000  # Total examples in FLAN dataset
train_ratio = 0.9
val_ratio = 0.05
test_ratio = 0.05

In [None]:
# Shuffle once (fixed seed), then carve the stream into train / validation /
# test splits before any tokenization happens.
shuffled = dataset.shuffle(seed=42, buffer_size=100_000)

# Split sizes derived from the configured ratios.
# NOTE(review): these sizes are counted in examples, while `dataset` was
# grouped into batches when it was created — confirm that take()/skip() below
# count the unit that is intended.
train_size, val_size, test_size = (
    int(total_examples * ratio)
    for ratio in (train_ratio, val_ratio, test_ratio)
)

# Consecutive, non-overlapping windows over the shuffled stream.
# NOTE(review): validation and test sit behind skip(train_size); presumably a
# streaming dataset has to read past the whole training window before it can
# yield them — confirm this is acceptable for periodic evaluation.
train_raw = shuffled.take(train_size)
remaining = shuffled.skip(train_size)
val_raw = remaining.take(val_size)
test_raw = remaining.skip(val_size).take(test_size)

# A 400-element slice of the validation split for quick evaluation runs.
smaller_val = val_raw.take(400)  # Make this smaller

# Tokenize every split with the same preprocessing function.
# NOTE(review): batched=True is applied to a dataset whose elements are
# already batches — confirm preprocess_forward receives the expected shape.
tokenized_train, tokenized_val, tokenized_test, tokenized_smaller_val = (
    split.map(preprocess_forward, batched=True)
    for split in (train_raw, val_raw, test_raw, smaller_val)
)

In [50]:
# Display the tokenized training split's summary.
tokenized_train

IterableDataset({
    features: Unknown,
    num_shards: 2157
})

In [49]:
# Display the reduced validation split's summary.
tokenized_smaller_val

IterableDataset({
    features: Unknown,
    num_shards: 2157
})

In [44]:
# Training configuration
intended_epochs = 5
gradient_accum_steps = 4

# One optimizer step consumes batch_size * gradient_accum_steps examples, so
# the step budget is (training examples // examples per step) * epochs.
examples_per_step = batch_size * gradient_accum_steps
steps_per_epoch = int(total_examples * train_ratio) // examples_per_step
max_steps = steps_per_epoch * intended_epochs
print(max_steps)

26578125


In [None]:
# Start a Weights & Biases run; the Trainer configured below reports to it
# through report_to="wandb".
wandb.init(entity="11785_finetuning", project='ivan-testing', name="testing")

In [47]:
# Hyper-parameters and bookkeeping for the Hugging Face Trainer, grouped by
# concern.
training_args = TrainingArguments(
    # Checkpoints and outputs
    output_dir=f"{OUTPUT_DIR}/pythia-finetuned",
    save_steps=100,
    save_total_limit=2,
    push_to_hub=False,

    # Optimization
    learning_rate=1e-5,
    weight_decay=0.01,
    fp16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accum_steps,
    max_steps=max_steps,  # Required for streaming datasets

    # Evaluation
    evaluation_strategy="steps",
    eval_steps=100,

    # Logging (set report_to="none" to disable experiment tracking)
    logging_dir="./logs",
    logging_steps=gradient_accum_steps,
    report_to="wandb",
    disable_tqdm=False,

    # Data loading
    dataloader_num_workers=4,
)

# Initialize trainer
# NOTE(review): both training and evaluation use `tokenized_smaller_val`, the
# 400-element validation slice. This looks like a smoke-test setup; a full run
# would presumably use `tokenized_train` / `tokenized_val` — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_smaller_val,
    eval_dataset=tokenized_smaller_val,
    tokenizer=tokenizer,
)

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [48]:
# Run the training loop configured above.
trainer.train()

# Persist the final model and its tokenizer side by side in one directory.
final_model_dir = f"{OUTPUT_DIR}/pythia-finetuned-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

KeyboardInterrupt: 

# Reversed-sequence model

In [None]:
# Stream the FLAN v2 training split from the Hugging Face Hub.
# NOTE(review): this cell rebinds `dataset`, `model` and `tokenizer`, replacing
# the objects created for the forward model earlier in the notebook — confirm
# that re-running earlier cells afterwards is not expected.
dataset = load_dataset("chiayewken/flan-v2", split="train", streaming=True)

# Reverse-direction Pythia checkpoint and its tokenizer.
model_name = "afterless/reverse-pythia-160m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Give the tokenizer a padding token so padding="max_length" can be used.
# NOTE(review): the forward section uses "<|padding|>" with id 1; here the pad
# token is "<pad>" with id 0 — confirm this matches the checkpoint's vocabulary.
tokenizer.pad_token = "<pad>"
tokenizer.pad_token_id = 0

def preprocess_reverse(example, max_length=768):
    """Tokenize one example and reverse its token order for the reverse LM.

    The example is rendered as ``"{source}\\n{target}"``, tokenized with
    padding/truncation to ``max_length``, and the token sequence is flipped.
    The flip mirrors wherever the tokenizer placed its padding.

    Labels are the reversed input ids themselves, with padding positions set
    to -100. Hugging Face causal-LM heads shift the labels internally, so no
    manual shift is applied here: the previous right-roll by one made each
    position's effective target its own input token, and padding positions
    were never excluded from the loss.

    Args:
        example: Mapping with ``'source'`` and ``'target'`` entries.
        max_length: Fixed sequence length to pad/truncate to.

    Returns:
        Dict with 1-D ``input_ids``, ``attention_mask`` and ``labels`` tensors,
        all in reversed token order.
    """
    combined_text = f"{example['source']}\n{example['target']}"

    tokenized_output = tokenizer(
        combined_text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    # Drop the batch dimension, then reverse along the sequence axis. The
    # attention mask is flipped the same way so it stays aligned with the ids.
    reversed_input_ids = tokenized_output["input_ids"].squeeze(0).flip(dims=[0])
    reversed_attention_mask = tokenized_output["attention_mask"].squeeze(0).flip(dims=[0])

    # Targets are aligned with the inputs; padding is ignored by the loss.
    labels = reversed_input_ids.clone()
    labels[reversed_attention_mask == 0] = -100

    return {
        "input_ids": reversed_input_ids,
        "attention_mask": reversed_attention_mask,
        "labels": labels,
    }

# Preview the first three streamed examples after reverse preprocessing.
# Pairing with a finite range stops the stream after exactly three items.
for example_number, example in zip(range(1, 4), dataset):
    tokenized_example = preprocess_reverse(example)
    reversed_ids = tokenized_example['input_ids']
    print(f"Example {example_number}:")
    print(f"Input IDs: {reversed_ids}")
    print(f"Decoded Tokens: {tokenizer.decode(reversed_ids, skip_special_tokens=True)}")
    print("-" * 50)
