In [1]:
import sys 
import os
# Append paths for the src folder
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'idl-project')))

In [2]:
# Additional imports 
from src.model import load_fo_model
from src.data import load_flan_dataset
from src.utils import DEVICE, CACHE_DIR

Using device: cuda


In [3]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import wandb

# SCALING to LARGER FLAN

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset

# Configuration
batch_size = 8
model_name = "EleutherAI/pythia-160m"
total_examples = 378_000_000  # Total examples in FLAN dataset
train_ratio = 0.9
val_ratio = 0.05
test_ratio = 0.05

# Load dataset and model
dataset = load_dataset("Open-Orca/FLAN", split="train", streaming=True)
model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure tokenizer properly
tokenizer.pad_token = tokenizer.eos_token

def preprocess_forward(example):
    combined_text = f"{example['inputs']}\n{example['targets']}{tokenizer.eos_token}"
    tokenized = tokenizer(
        combined_text,
        truncation=True,
        max_length=768,
        padding="max_length",
        return_tensors="pt"
    )
    input_ids = tokenized["input_ids"][0]
    labels = input_ids.clone()
    labels[:-1] = input_ids[1:]
    labels[input_ids == tokenizer.pad_token_id] = -100
    return {"input_ids": input_ids, "labels": labels}



Resolving data files:   0%|          | 0/2167 [00:00<?, ?it/s]

In [5]:
# Define test prompts
test_prompts = [
    "Who is Barack Obama?",
    "What is Carnegie Mellon University?",
    "Classify this restaurant review sentiment: 'The food was absolutely delicious but the service was extremely slow and the waiter seemed uninterested in helping us.'",
    "Compare and contrast Carnegie Mellon University's Computer Science and Information Systems programs in terms of research focus and career outcomes.",
    "Summarize in one sentence: Dr. Sarah Chen, lead scientist on the mission, called it 'the most significant discovery in the history of space exploration.' The finding suggests that Mars once had a much more hospitable environment with liquid water and possibly a thicker atmosphere. The agency plans to send a sample return mission within the next five years to bring these fossils back to Earth for more detailed analysis. This discovery has profound implications for our understanding of how life might develop throughout the universe."
]


for test_input_string in test_prompts:
    inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)
    # print(tokens[0])
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id)
    print("Without sampling: " + tokenizer.decode(tokens[0]))
    
    print("---------------")
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id, do_sample=True)
    print("With sampling   : " + tokenizer.decode(tokens[0]))
    
    print("\n===============")

Without sampling: Who is Barack Obama?

The president is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the people. He is a man of the
---------------
With sampling   : Who is Barack Obama? I didn't care about him. Here's the thing, he was all sorts of crap. He's about as bad or worse as this piece goes. Here's the fact that we're all going to hear all of this and say, "Wow, this is actually great."

"It's really not," the president has said. "We believe in Obama." But it's not like he ever got to say any more on issues like this before. If you get a president who is willing to say everything, it us

In [6]:

# Shuffle and split before preprocessing
shuffled = dataset.shuffle(seed=42, buffer_size=100_000)

# Calculate split sizes
train_size = int(total_examples * train_ratio)
val_size = int(total_examples * val_ratio)
test_size = int(total_examples * test_ratio)

# Create splits
train_raw = shuffled.take(train_size)
remaining = shuffled.skip(train_size)
val_raw = remaining.take(val_size)
test_raw = remaining.skip(val_size).take(test_size)

# Preprocess each split
tokenized_train = train_raw.map(preprocess_forward, batched=False)
tokenized_val = val_raw.map(preprocess_forward, batched=False)
tokenized_test = test_raw.map(preprocess_forward, batched=False)

# Training configuration
training_args = TrainingArguments(
    output_dir="./pythia-finetuned",
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    max_steps=10_000,  # Required for streaming datasets
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=1000,
    logging_dir="./logs",
    logging_steps=100,
    gradient_accumulation_steps=2,
    fp16=True,
    report_to="none",
    push_to_hub=False
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Save final model
trainer.save_model("./pythia-finetuned-final")
tokenizer.save_pretrained("./pythia-finetuned-final")

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [7]:
for test_input_string in test_prompts:
    inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)
    # print(tokens[0])
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id)
    print("Without sampling: " + tokenizer.decode(tokens[0]))
    
    print("---------------")
    tokens = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id, do_sample=True)
    print("With sampling   : " + tokenizer.decode(tokens[0]))
    
    print("\n===============")

Without sampling: Who is Barack Obama?-

 is- is- is- is- is- a- is- a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is is is is is is is is is is is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is a is is is a is is a
---------------
With sampling   : Who is Barack Obama?–<|endoftext|>

Without sampling: What is Carnegie Mellon University?- Carnegie Mell University- is university the of university the of. is university the of is university the of is

 a of is
<|endoftext|>
---------------
With sampling   : What is Carnegie Mellon University?-aney-ul. is university is university is on campus of Mell
 is community college is college the at ofacell community atellin
-----END:ADDRESS
:ame ""
: is is of country is ,razil origin- is ,ox origin andox
 is citynsylvaniaork country     is states states k     is     is- is