In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from safetensors.torch import save_file
from peft import get_peft_model, LoraConfig, TaskType
from peft import PeftModel

In [2]:
def load_and_preprocess_data(train_file, validation_file, tokenizer): 

    data_files = {
        'train': train_file,
        'validation': validation_file
    }
    dataset = load_dataset('json', data_files=data_files)
    
    def preprocess_function(examples):
        max_length = 32

        inputs = examples['input']
        outputs = [str(o) for o in examples['output']]

        prompts = [f"{inp}\n" for inp in inputs]
        full_texts = [prompt + out for prompt, out in zip(prompts, outputs)]

        tokenized_full = tokenizer(full_texts, truncation=True, padding='max_length', max_length=max_length)

        tokenized_prompt = tokenizer(prompts, truncation=True, padding='max_length', max_length=max_length)

        labels = []
        for i in range(len(full_texts)):

            prompt_len = len(tokenizer.encode(prompts[i], truncation=True, max_length=max_length))
    
            label = [-100] * prompt_len + tokenized_full['input_ids'][i][prompt_len:]
       
            label = label[:max_length]
      
            if len(label) < max_length:
                label += [-100] * (max_length - len(label))
            labels.append(label)


        tokenized_full['labels'] = labels

        return tokenized_full
    

    tokenized_datasets = dataset.map(preprocess_function, batched=True)
  
    tokenized_datasets = tokenized_datasets.remove_columns(['input', 'output', 'instruction'])
    
    return tokenized_datasets

In [3]:
model_name = 'EleutherAI/pythia-1.4b-deduped'
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
train_file = 'dataset/add_sub_100/train.jsonl'
validation_file = 'dataset/add_sub_100/test.jsonl'

tokenized_datasets = load_and_preprocess_data(train_file, validation_file, tokenizer)

print(tokenized_datasets['train'][:5])
print(tokenized_datasets['validation'][:5])

train_size = len(tokenized_datasets['train'])
validation_size = len(tokenized_datasets['validation'])

{'input_ids': [[5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 2233, 559, 337, 426, 187, 6903, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 6931, 559, 5922, 426, 187, 3547, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 11107, 559, 8255, 426, 187, 16989, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 1283, 428, 818, 426, 187, 883, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5850, 253, 906, 273, 253, 1563, 27844, 2048, 285, 2085, 760, 253, 2457, 3662, 27, 5976, 559, 3387, 426, 187, 3507, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1

In [5]:
scratch_cache_dir = "/mnt/fast0/rje41/.cache/huggingface"    
model_path = "EleutherAI/pythia-1.4b-deduped"

os.makedirs(os.path.join(scratch_cache_dir, "hub"), exist_ok=True)
os.makedirs(os.path.join(scratch_cache_dir, "datasets"), exist_ok=True)

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             cache_dir=os.path.join(scratch_cache_dir, "hub")
                                            )

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,   
    inference_mode=False,          
    r=32,  
    lora_alpha=64,  
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


In [6]:
per_device_train_batch_size = 8
gradient_accumulation_steps = 4
training_args = TrainingArguments(
                                                output_dir='./checkpoints/add_sub/',
                                                per_device_train_batch_size=per_device_train_batch_size,
                                                weight_decay=0.01,
                                                logging_dir='./logs',
                                                logging_steps=10,
                                                save_steps=30, 
                                                save_strategy="steps",
                                                fp16=True,
                                                gradient_accumulation_steps=gradient_accumulation_steps,
                                                report_to="none",
                                                learning_rate=3e-4,
                                                max_steps = 300, 
                                            )

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer
)
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,5.7919
20,0.1411
30,0.1053
40,0.0893
50,0.0722
60,0.0687
70,0.0577
80,0.0554
90,0.0524
100,0.0499


TrainOutput(global_step=300, training_loss=0.2330486936867237, metrics={'train_runtime': 130.1025, 'train_samples_per_second': 73.788, 'train_steps_per_second': 2.306, 'total_flos': 2423111049805824.0, 'train_loss': 0.2330486936867237, 'epoch': 2.127886323268206})

### Set-up second task

In [8]:
model_name = 'EleutherAI/pythia-1.4b-deduped'
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

train_mul_div_file = 'dataset/mul_div/train.jsonl'
test_mul_div_file = 'dataset/mul_div/test.jsonl'
tokenized_datasets_mul_div = load_and_preprocess_data(train_mul_div_file, test_mul_div_file, tokenizer)

In [9]:
scratch_cache_dir = "/mnt/fast0/rje41/.cache/huggingface"  
model_mul_div = AutoModelForCausalLM.from_pretrained(model_name,
                                             cache_dir=os.path.join(scratch_cache_dir, "hub")
                                            )

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,   
    inference_mode=False,          
    r=32,  
    lora_alpha=64,  
)

model = get_peft_model(model_mul_div, lora_config)

model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


In [10]:
per_device_train_batch_size = 8
gradient_accumulation_steps = 4
training_args = TrainingArguments(
                                                output_dir='./checkpoints/mul_div/',
                                                per_device_train_batch_size=per_device_train_batch_size,
                                                weight_decay=0.01,
                                                logging_dir='./logs',
                                                logging_steps=10,
                                                save_steps=30, 
                                                save_strategy="steps",
                                                fp16=True,
                                                gradient_accumulation_steps=gradient_accumulation_steps,
                                                report_to="none",
                                                learning_rate=3e-4,
                                                max_steps = 300, 
                                            )

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_mul_div['train'],
    eval_dataset=tokenized_datasets_mul_div['validation'],
    tokenizer=tokenizer
)
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,5.7617
20,0.1589
30,0.0777
40,0.0523
50,0.0517
60,0.0436
70,0.0352
80,0.0291
90,0.0256
100,0.0236


TrainOutput(global_step=300, training_loss=0.2153280605748296, metrics={'train_runtime': 134.8545, 'train_samples_per_second': 71.188, 'train_steps_per_second': 2.225, 'total_flos': 2429184009830400.0, 'train_loss': 0.2153280605748296, 'epoch': 2.4})

### Joint Training

In [4]:
train_merged_file = 'dataset/merged/train.jsonl'
test_merged_file = 'dataset/merged/test.jsonl'
merged_tokenized_datasets = load_and_preprocess_data(train_merged_file, test_merged_file, tokenizer)

Map:   0%|          | 0/8500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [5]:
from datasets import DatasetDict
shuffled_dataset_dict = DatasetDict({
    "train": merged_tokenized_datasets["train"].shuffle(seed=42),
    "validation": merged_tokenized_datasets["validation"].shuffle(seed=42)
})

In [6]:
scratch_cache_dir = "/mnt/fast0/rje41/.cache/huggingface"  
model_merge = AutoModelForCausalLM.from_pretrained(model_name,
                                             cache_dir=os.path.join(scratch_cache_dir, "hub")
                                            )

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,   
    inference_mode=False,          
    r=32,  
    lora_alpha=64,  
)

model = get_peft_model(model_merge, lora_config)

model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


In [7]:
per_device_train_batch_size = 8
gradient_accumulation_steps = 4
training_args = TrainingArguments(
                                                output_dir='./checkpoints/merged/',
                                                per_device_train_batch_size=per_device_train_batch_size,
                                                weight_decay=0.01,
                                                logging_dir='./logs',
                                                logging_steps=10,
                                                save_steps=30, 
                                                save_strategy="steps",
                                                fp16=True,
                                                gradient_accumulation_steps=gradient_accumulation_steps,
                                                report_to="none",
                                                learning_rate=3e-4,
                                                max_steps = 300, 
                                            )

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_dataset_dict['train'],
    eval_dataset=shuffled_dataset_dict['validation'],
    tokenizer=tokenizer
)
trainer.train()

  trainer = Trainer(


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,5.8078
20,0.1681
30,0.1047
40,0.0851
50,0.0661
60,0.073
70,0.0645
80,0.0626
90,0.0499
100,0.0487


TrainOutput(global_step=300, training_loss=0.23838873252272605, metrics={'train_runtime': 117.6636, 'train_samples_per_second': 81.588, 'train_steps_per_second': 2.55, 'total_flos': 2426147529818112.0, 'train_loss': 0.23838873252272605, 'epoch': 1.1279397930385702})