In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6"  
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from safetensors.torch import save_file
from peft import get_peft_model, LoraConfig, TaskType
from peft import PeftModel
from pathlib import Path
import csv

In [2]:
def load_and_preprocess_data(train_file, validation_file, tokenizer): 

    data_files = {
        'train': train_file,
        'validation': validation_file
    }
    dataset = load_dataset('json', data_files=data_files)
    
    def preprocess_function(examples):
        max_length = 32

        inputs = examples['input']
        outputs = [str(o) for o in examples['output']]

        prompts = [f"{inp}\n" for inp in inputs]
        full_texts = [prompt + out for prompt, out in zip(prompts, outputs)]

        tokenized_full = tokenizer(full_texts, truncation=True, padding='max_length', max_length=max_length)

        tokenized_prompt = tokenizer(prompts, truncation=True, padding='max_length', max_length=max_length)

        labels = []
        for i in range(len(full_texts)):

            prompt_len = len(tokenizer.encode(prompts[i], truncation=True, max_length=max_length))
    
            label = [-100] * prompt_len + tokenized_full['input_ids'][i][prompt_len:]
       
            label = label[:max_length]
      
            if len(label) < max_length:
                label += [-100] * (max_length - len(label))
            labels.append(label)


        tokenized_full['labels'] = labels

        return tokenized_full
    

    tokenized_datasets = dataset.map(preprocess_function, batched=True)
  
    tokenized_datasets = tokenized_datasets.remove_columns(['input', 'output', 'instruction'])
    return tokenized_datasets

In [3]:
def convert_csv_to_json(train_file_csv: str, test_file_csv: str, output_dir: str, id) -> None:
    data_train = []
    subfolder = os.path.join(output_dir, f"prompts_id_{id}")
    filename_train_jsonl = os.path.join(subfolder, "train.jsonl")
    filename_test_jsonl = os.path.join(subfolder, "test.jsonl")
    os.makedirs(subfolder, exist_ok=True)

    data_train = []
    with open(train_file_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)
        for row in reader:
            data_train.append({"instruction": "", "input": row[0], "output": row[2]})

    with open(filename_train_jsonl, 'w', encoding='utf-8') as f:
        for item in data_train:
            json_line = json.dumps(item)
            f.write(json_line + '\n')

    data_test = []
    with open(test_file_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)
        for row in reader:
            data_test.append({"instruction": "", "input": row[0], "output": row[2]})

    with open(filename_test_jsonl, 'w', encoding='utf-8') as f:
        for item in data_test:
            json_line = json.dumps(item)
            f.write(json_line + '\n')

    return filename_train_jsonl, filename_test_jsonl

In [4]:
def setup_model_peft(model_name: str, scratch_cache_dir: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              cache_dir=os.path.join(scratch_cache_dir, "hub"))

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token   
    
    os.makedirs(os.path.join(scratch_cache_dir, "hub"), exist_ok=True)
    os.makedirs(os.path.join(scratch_cache_dir, "datasets"), exist_ok=True)
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 cache_dir=os.path.join(scratch_cache_dir, "hub")
                                                )
    
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,   
        inference_mode=False,          
        r=32,  
        lora_alpha=64,  
    )
    model = get_peft_model(model, lora_config)

    model.print_trainable_parameters()

    return model, tokenizer

In [5]:
MODEL_NAME = 'EleutherAI/pythia-1.4b-deduped'
scratch_cache_dir = "/mnt/faster0/rje41/.cache/huggingface"   
per_device_train_batch_size = 8
gradient_accumulation_steps = 4   

folder_paths = [p for p in Path('datasets_csv_no_prefix').rglob('*') if p.is_dir()]
for folder_path in folder_paths:
    train_file_csv = str(folder_path / 'train.csv')
    test_file_csv = str(folder_path / 'test.csv')
    id = folder_path.name.replace('prompts_id_','')
    print(train_file_csv)

    model, tokenizer = setup_model_peft(model_name = MODEL_NAME,
                            scratch_cache_dir = scratch_cache_dir)
    

    filename_train_jsonl, filename_test_jsonl = convert_csv_to_json(train_file_csv, test_file_csv, 'datasets_json_no_prefix', id)
    
    tokenized_datasets = load_and_preprocess_data(str(filename_train_jsonl), str(filename_test_jsonl), tokenizer)
    
    training_args = TrainingArguments(output_dir=f'./checkpoints_no_prefix/prompt_template_{id}/',
                                      per_device_train_batch_size=per_device_train_batch_size,
                                      weight_decay=0.01,
                                      logging_dir='./logs',
                                      logging_steps=10,
                                      save_steps=20, 
                                      save_strategy="steps",
                                      fp16=True,
                                      gradient_accumulation_steps=gradient_accumulation_steps,
                                      report_to="none",
                                      learning_rate=3e-4,
                                      max_steps = 80, 
                                     )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        tokenizer=tokenizer
    )
    trainer.train()

datasets_csv_no_prefix/prompts_id_0/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.1745
20,0.1561
30,0.0953
40,0.075
50,0.0633
60,0.0544
70,0.0478
80,0.0486


datasets_csv_no_prefix/prompts_id_5/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.4819
20,0.1598
30,0.108
40,0.0791
50,0.0672
60,0.0614
70,0.0541
80,0.0474


datasets_csv_no_prefix/prompts_id_8/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.8465
20,0.1345
30,0.078
40,0.0412
50,0.0184
60,0.0084
70,0.0046
80,0.0035


datasets_csv_no_prefix/prompts_id_4/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.0458
20,0.1787
30,0.157
40,0.1342
50,0.1183
60,0.0982
70,0.0924
80,0.0882


datasets_csv_no_prefix/prompts_id_9/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.0368
20,0.2062
30,0.1791
40,0.1588
50,0.1457
60,0.1345
70,0.1286
80,0.1246


datasets_csv_no_prefix/prompts_id_2/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.5454
20,0.1321
30,0.0951
40,0.0747
50,0.0695
60,0.0563
70,0.0569
80,0.0504


datasets_csv_no_prefix/prompts_id_7/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.2411
20,0.2017
30,0.1632
40,0.1376
50,0.0986
60,0.0784
70,0.0736
80,0.0641


datasets_csv_no_prefix/prompts_id_1/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.5787
20,0.112
30,0.0842
40,0.0648
50,0.0576
60,0.0479
70,0.0431
80,0.0458


datasets_csv_no_prefix/prompts_id_3/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.7415
20,0.1459
30,0.1026
40,0.0816
50,0.072
60,0.0628
70,0.0589
80,0.0549


datasets_csv_no_prefix/prompts_id_6/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.4999
20,0.1846
30,0.1576
40,0.1314
50,0.1164


OSError: [Errno 122] Disk quota exceeded

In [6]:
MODEL_NAME = 'EleutherAI/pythia-1.4b-deduped'
scratch_cache_dir = "/mnt/fast0/rje41/.cache/huggingface" 
per_device_train_batch_size = 8
gradient_accumulation_steps = 4   

folder_path = Path('../../fine-tuning/add_sub_nlp/datasets_csv/prompts_id_9')
train_file_csv = str(folder_path / 'train.csv')
test_file_csv = str(folder_path / 'test.csv')
id = folder_path.name.replace('prompts_id_','')
print(train_file_csv)

model, tokenizer = setup_model_peft(model_name = MODEL_NAME,
                        scratch_cache_dir = scratch_cache_dir)
 

filename_train_jsonl, filename_test_jsonl = convert_csv_to_json(train_file_csv, test_file_csv, 'datasets_json', id)
 
tokenized_datasets = load_and_preprocess_data(str(filename_train_jsonl), str(filename_test_jsonl), tokenizer)
 
training_args = TrainingArguments(output_dir=f'./checkpoints/prompt_template_{id}/',
                                      per_device_train_batch_size=per_device_train_batch_size,
                                      weight_decay=0.01,
                                      logging_dir='./logs',
                                      logging_steps=10,
                                      save_steps=100, 
                                      save_strategy="steps",
                                      fp16=True,
                                      gradient_accumulation_steps=gradient_accumulation_steps,
                                      report_to="none",
                                      learning_rate=3e-4,
                                      max_steps = 600, 
                                     )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer
)
trainer.train()

../../fine-tuning/add_sub_nlp/datasets_csv/prompts_id_9/train.csv
trainable params: 6,291,456 || all params: 1,420,939,264 || trainable%: 0.4428


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.0168
20,0.2499
30,0.1447
40,0.1309
50,0.1012
60,0.0917
70,0.0712
80,0.0717
90,0.0583
100,0.0518


TrainOutput(global_step=600, training_loss=0.1268577162642032, metrics={'train_runtime': 335.8014, 'train_samples_per_second': 57.177, 'train_steps_per_second': 1.787, 'total_flos': 4840149139587072.0, 'train_loss': 0.1268577162642032, 'epoch': 3.8256})