In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from safetensors.torch import save_file

In [2]:
def load_and_preprocess_data(train_file, validation_file, tokenizer): 

    data_files = {
        'train': train_file,
        'validation': validation_file
    }
    dataset = load_dataset('json', data_files=data_files)
    
    def preprocess_function(examples):
        max_length = 32

        inputs = examples['input']
        outputs = [str(o) for o in examples['output']]

        prompts = [f"{inp}\n" for inp in inputs]
        full_texts = [prompt + out for prompt, out in zip(prompts, outputs)]

        tokenized_full = tokenizer(full_texts, truncation=True, padding='max_length', max_length=max_length)

        tokenized_prompt = tokenizer(prompts, truncation=True, padding='max_length', max_length=max_length)

        labels = []
        for i in range(len(full_texts)):

            prompt_len = len(tokenizer.encode(prompts[i], truncation=True, max_length=max_length))
    
            label = [-100] * prompt_len + tokenized_full['input_ids'][i][prompt_len:]
       
            label = label[:max_length]
      
            if len(label) < max_length:
                label += [-100] * (max_length - len(label))
            labels.append(label)


        tokenized_full['labels'] = labels

        return tokenized_full
    

    tokenized_datasets = dataset.map(preprocess_function, batched=True)
  
    tokenized_datasets = tokenized_datasets.remove_columns(['input', 'output', 'instruction'])
    
    return tokenized_datasets


train_file = 'data/train_100.jsonl'
validation_file = 'data/test_100.jsonl'

model_name = 'EleutherAI/pythia-1.4b-deduped'
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenized_datasets = load_and_preprocess_data(train_file, validation_file, tokenizer)

print(tokenized_datasets['train'][:5])
print(tokenized_datasets['validation'][:5])

train_size = len(tokenized_datasets['train'])
validation_size = len(tokenized_datasets['validation'])

{'input_ids': [[5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 2233, 559, 337, 426, 187, 6903, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 6931, 559, 5922, 426, 187, 3547, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 11107, 559, 8255, 426, 187, 16989, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 1283, 428, 818, 426, 187, 883, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 5976, 559, 3387, 426, 187, 3507, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1

In [3]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
from peft import PeftModel
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

scratch_cache_dir = "/mnt/fast0/rje41/.cache/huggingface"    
model_path = "EleutherAI/pythia-1.4b-deduped"
os.makedirs(os.path.join(scratch_cache_dir, "hub"), exist_ok=True)
os.makedirs(os.path.join(scratch_cache_dir, "datasets"), exist_ok=True)

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             cache_dir=os.path.join(scratch_cache_dir, "hub")
                                            )


lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,   
    inference_mode=False,          
    r=32,  
    lora_alpha=64,  
    lora_dropout=0,  
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


In [4]:
training_args = TrainingArguments(
    output_dir='./lora_opt_results/r32a64',    
    num_train_epochs=2,                        
    per_device_train_batch_size=8,            
    warmup_steps=50,                            
    weight_decay=0.01,                         
    logging_dir='./circuit_weighted_lora_logs',   
    logging_steps=10,              
    save_steps=28,                                
    save_strategy="steps",                       
    save_total_limit=10,                            
    fp16=True,                                     
    gradient_accumulation_steps=4,                
    report_to="none",                             
    learning_rate=3e-4,                            
)

In [5]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer
)
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,17.3913
20,9.0818
30,0.319
40,0.1512
50,0.1014
60,0.0865
70,0.0667
80,0.0697
90,0.0664
100,0.0586


TrainOutput(global_step=282, training_loss=0.990096267200478, metrics={'train_runtime': 127.2127, 'train_samples_per_second': 70.748, 'train_steps_per_second': 2.217, 'total_flos': 2277360009216000.0, 'train_loss': 0.990096267200478, 'epoch': 2.0})