In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from safetensors.torch import save_file

In [2]:
def _mask_prompt_and_padding(input_ids, attention_mask, prompt_len):
    """Build causal-LM labels that supervise only the answer tokens.

    The first ``prompt_len`` real (attended) tokens and every padding position
    are set to -100; the remaining real tokens keep their token id. Counting
    real tokens through ``attention_mask`` keeps the result correct whether the
    tokenizer pads on the right or on the left.
    """
    labels = []
    real_seen = 0
    for token_id, is_real in zip(input_ids, attention_mask):
        if is_real and real_seen >= prompt_len:
            labels.append(token_id)
        else:
            labels.append(-100)
        if is_real:
            real_seen += 1
    return labels


def load_and_preprocess_data(train_file, validation_file, tokenizer, max_length=32):
    """Load train/validation JSON files and tokenize them for causal-LM fine-tuning.

    Every record must provide an ``input`` and an ``output`` field. The training
    text is ``f"{input}\\n"`` followed by ``str(output)``; it is truncated and
    padded to ``max_length`` tokens.

    Args:
        train_file: Path handed to ``load_dataset('json', ...)`` as the ``train`` split.
        validation_file: Path handed to ``load_dataset`` as the ``validation`` split.
        tokenizer: Callable tokenizer; it must be able to pad (a pad token has to be set).
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        The mapped dataset dict with ``labels`` added next to the tokenizer
        outputs, and with the raw ``input`` / ``output`` / ``instruction``
        columns removed wherever they exist.
    """
    data_files = {
        'train': train_file,
        'validation': validation_file
    }
    dataset = load_dataset('json', data_files=data_files)

    def preprocess_function(examples):
        prompts = [f"{inp}\n" for inp in examples['input']]
        full_texts = [prompt + str(out) for prompt, out in zip(prompts, examples['output'])]

        tokenized_full = tokenizer(
            full_texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
        )

        # Unpadded prompt encoding, used only to count the prompt's tokens.
        # NOTE(review): assumes the prompt tokenizes to a prefix of the full
        # text's tokens (no merge across the trailing "\n") — confirm for the
        # tokenizer in use.
        prompt_ids = tokenizer(prompts, truncation=True, max_length=max_length)['input_ids']

        # Padding must not be supervised: when pad_token is aliased to
        # eos_token the pad ids are ordinary vocabulary ids, so leaving them in
        # the labels lets padding contribute to the loss on short examples.
        # NOTE(review): no EOS is appended to the text, so with padding masked
        # nothing teaches the model to stop after the answer; append
        # tokenizer.eos_token to the outputs if that is needed.
        tokenized_full['labels'] = [
            _mask_prompt_and_padding(input_ids, attention_mask, len(p_ids))
            for input_ids, attention_mask, p_ids in zip(
                tokenized_full['input_ids'], tokenized_full['attention_mask'], prompt_ids
            )
        ]

        return tokenized_full

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    # Drop the raw text columns, but only those a split actually has, so files
    # without an 'instruction' field do not raise.
    raw_columns = ('input', 'output', 'instruction')
    for split in list(tokenized_datasets.keys()):
        present = [col for col in raw_columns if col in tokenized_datasets[split].column_names]
        if present:
            tokenized_datasets[split] = tokenized_datasets[split].remove_columns(present)

    return tokenized_datasets

In [34]:
model_name = 'EleutherAI/pythia-1.4b-deduped'

# Resolve the cache locations once so the tokenizer and the model weights are
# both downloaded into the same scratch hub cache.
scratch_cache_dir = "/mnt/fast0/rje41/.cache/huggingface"
hub_cache_dir = os.path.join(scratch_cache_dir, "hub")
os.makedirs(hub_cache_dir, exist_ok=True)
os.makedirs(os.path.join(scratch_cache_dir, "datasets"), exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hub_cache_dir)

# Preprocessing pads every example to a fixed length, which needs a pad token;
# fall back to EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=hub_cache_dir)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
# Input files for the first task.
train_file = 'data/train_100.jsonl'
validation_file = 'data/test_100.jsonl'

tokenized_datasets = load_and_preprocess_data(train_file, validation_file, tokenizer)

# Show the first five processed rows of each split as a sanity check.
print(
    tokenized_datasets['train'][:5],
    tokenized_datasets['validation'][:5],
    sep='\n',
)

# Number of examples per split.
train_size, validation_size = map(
    len, (tokenized_datasets['train'], tokenized_datasets['validation'])
)

In [None]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
from peft import PeftModel
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Base checkpoint and the cache root used for Hugging Face downloads.
model_path = "EleutherAI/pythia-1.4b-deduped"
scratch_cache_dir = "/mnt/fast0/rje41/.cache/huggingface"
os.makedirs(os.path.join(scratch_cache_dir, "hub"), exist_ok=True)
os.makedirs(os.path.join(scratch_cache_dir, "datasets"), exist_ok=True)

# LoRA settings for causal-LM training: rank 32, alpha 64, no dropout.
# No target_modules are given, so the choice of adapted layers is left to PEFT.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

# Load the base weights from the hub cache and wrap them with the adapter.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(
        model_path,
        cache_dir=os.path.join(scratch_cache_dir, "hub"),
    ),
    lora_config,
)

# Report how many parameters the adapter leaves trainable.
model.print_trainable_parameters()

In [None]:
# Training configuration for the first task, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir='./lora_opt_results/add_sub/',
    logging_dir='./circuit_weighted_lora_logs',
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=3e-4,
    warmup_steps=50,
    weight_decay=0.01,
    # Batch of 8 per device, accumulated over 4 steps before each update.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    fp16=True,
    # Log every 10 steps; checkpoint every 28 steps, keeping at most 10.
    logging_steps=10,
    save_strategy="steps",
    save_steps=28,
    save_total_limit=10,
)

In [None]:
# Fine-tune the LoRA-wrapped model on the first task's tokenized splits.
# NOTE(review): an eval split is supplied, but the TrainingArguments above set
# no evaluation schedule — confirm whether evaluation is meant to run.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_datasets['validation'],
    train_dataset=tokenized_datasets['train'],
)
trainer.train()

### Set up the second task (mul/div)

In [35]:
from peft import PeftModel, PeftConfig
# Input files for the second task.
test_mul_div_file = '../dataset/mul_div/test_mul_div.jsonl'
train_mul_div_file = '../dataset/mul_div/train_mul_div.jsonl'

# Replaces the first task's `tokenized_datasets` with the mul/div splits.
tokenized_datasets = load_and_preprocess_data(
    train_mul_div_file,
    test_mul_div_file,
    tokenizer,
)

# Number of examples per split.
train_size, validation_size = map(
    len, (tokenized_datasets['train'], tokenized_datasets['validation'])
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [36]:
from transformers import AutoModelForCausalLM
from peft import PeftConfig, PeftModel, get_peft_model

# Saved LoRA checkpoint to continue training from (presumably the add/sub run,
# judging by the path). Defined once so the path cannot drift between calls.
add_sub_checkpoint = "../../add_sub/lora_opt_results/r32a64/checkpoint-282"

# Load a fresh copy of the base model to attach the saved adapter to.
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  cache_dir=os.path.join(scratch_cache_dir, "hub"))

# PeftModel.from_pretrained reads the adapter config from the checkpoint,
# creates the "default" adapter and loads its saved weights in one step.
# is_trainable=True keeps the adapter weights unfrozen for further training.
# This replaces get_peft_model + load_adapter, which first built a randomly
# initialised adapter only to overwrite it, and which passed is_trainable to
# PeftConfig.from_pretrained, where it is not a config field.
ad_model = PeftModel.from_pretrained(
    base_model,
    add_sub_checkpoint,
    adapter_name="default",
    is_trainable=True,
)

# Keep `peft_config` bound in case other cells still reference it.
peft_config = ad_model.peft_config["default"]

ad_model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


In [37]:
# Training configuration for the second task, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir='./lora_opt_results/mul_div/',
    logging_dir='./circuit_weighted_lora_logs',
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=3e-4,
    warmup_steps=50,
    weight_decay=0.01,
    # Batch of 8 per device, accumulated over 4 steps before each update.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    fp16=True,
    # Log every 10 steps; checkpoint every 28 steps, keeping at most 10.
    logging_steps=10,
    save_strategy="steps",
    save_steps=28,
    save_total_limit=10,
)


# Continue training the reloaded adapter model on the second task's splits.
# NOTE(review): an eval split is supplied, but the TrainingArguments above set
# no evaluation schedule — confirm whether evaluation is meant to run.
trainer = Trainer(
    args=training_args,
    model=ad_model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_datasets['validation'],
    train_dataset=tokenized_datasets['train'],
)
trainer.train()

  trainer = Trainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 23.57 GiB of which 3.50 MiB is free. Process 1665135 has 16.97 GiB memory in use. Including non-PyTorch memory, this process has 6.58 GiB memory in use. Of the allocated memory 6.27 GiB is allocated by PyTorch, and 13.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)