In [1]:
# %pip install -q -U bitsandbytes
# %pip install -q -U git+https://github.com/huggingface/transformers.git
# %pip install -q -U git+https://github.com/huggingface/peft.git
# %pip install -q -U git+https://github.com/huggingface/accelerate.git
# %pip install -q -U trl
# %pip install -q -U scipy
# %pip install ipywidgets

In [2]:
import os

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch import cuda
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer

In [3]:
# Ask PyTorch's CUDA allocator to release any cached memory before the model is loaded below.
torch.cuda.empty_cache()

In [4]:
# Report which backend is available; "cuda" when a GPU is visible to PyTorch, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [5]:
# Record the installed PyTorch build in the notebook output for reproducibility.
print(torch.__version__)

2.1.2+cu121


In [6]:
# Identifiers and paths shared by the cells below.
base_model = "berkeley-nest/Starling-LM-7B-alpha"  # model id passed to from_pretrained (model and tokenizer)
dataset_path = "fine_tune_data/arc_aug_train.json"  # JSON file read with pd.read_json
fine_tuned_model = "finetuned-models/Starling-LM-7B-alpha-finetuned"  # save_pretrained target; also appended to "./results/" for checkpoints

In [7]:
# LoRA adapter settings; handed to SFTTrainer through its `peft_config` argument.
# Optional overrides left disabled in the original:
#   inference_mode=False
#   target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
peft_config = LoraConfig(**{
    "task_type": "CAUSAL_LM",
    "r": 16,              # rank of the low-rank update matrices
    "lora_alpha": 16,     # LoRA scaling factor
    "lora_dropout": 0.1,  # dropout applied inside the LoRA layers
    "bias": "none",       # no bias parameters are trained
})

In [8]:
# bitsandbytes quantisation settings used when loading the base model:
# 4-bit NF4 weights, bfloat16 compute, nested (double) quantisation switched off.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [9]:
# Read the training examples from the JSON file into a DataFrame.
df = pd.read_json(dataset_path)

# Wrap the frame as a Hugging Face Dataset so it can be passed to the trainer.
dataset = Dataset.from_pandas(df)

# Show the available columns (the recorded output lists 'prompt' and 'test_output').
dataset.column_names

['prompt', 'test_output']

In [10]:
# Template file contents keyed by path, so the file is read from disk once
# instead of once per training example. Re-running this cell resets the cache,
# which is how an edited template file gets picked up.
_PROMPT_TEMPLATE_CACHE = {}


def _load_prompt_template(path="prompt-template.txt"):
    """Return the text of the template file at `path`, reading it only on first use."""
    if path not in _PROMPT_TEMPLATE_CACHE:
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, "r", encoding="utf-8") as f:
            _PROMPT_TEMPLATE_CACHE[path] = f.read()
    return _PROMPT_TEMPLATE_CACHE[path]


def generate_prompt(data_point):
    """Build the full training text for one example.

    Args:
        data_point: mapping with a 'prompt' entry (the user-turn content) and
            a 'test_output' entry (the assistant-turn content).

    Returns:
        A single string: the shared template from "prompt-template.txt", a
        newline, the example prompt, then the response, with each turn closed
        by "<|end_of_turn|>".

    Raises:
        KeyError: if 'prompt' or 'test_output' is missing from `data_point`.
        OSError: if the template file cannot be read on first use.
    """
    # NOTE(review): there is no space after "User:" / "Assistant:" here — confirm
    # the inference-side prompt uses exactly the same layout.
    temp = "GPT4 Correct User:{prompt_template}\n{prompt}<|end_of_turn|>GPT4 Correct Assistant:{response}<|end_of_turn|>"
    return temp.format(
        prompt_template=_load_prompt_template(),
        prompt=data_point['prompt'],
        response=data_point['test_output'],
    )

In [11]:
# Render every example into its full training text, then attach the result
# to the dataset as a new "input" column.
text_column = list(map(generate_prompt, dataset))
dataset = dataset.add_column(name="input", column=text_column)

In [12]:
# Inspect the dataset: features and row count.
dataset

Dataset({
    features: ['prompt', 'test_output', 'input'],
    num_rows: 23440
})

In [13]:
# The raw prompt is already embedded in the "input" text, so drop the separate column.
dataset = dataset.remove_columns("prompt")

In [14]:
# List the columns that remain on the dataset.
dataset.column_names

['test_output', 'input']

In [15]:
# Load the base model with the bitsandbytes quantisation settings from `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    quantization_config=bnb_config,
)

# Config adjustments made before the k-bit preparation step:
#  - pretraining_tp = 1: advice carried over from Llama-2 LoRA training (it is 1
#    by default for the 7B models but not for the larger ones, and only newer
#    transformers releases read this parameter).
#  - use_cache = False: turn off the generation cache while training.
model.config.pretraining_tp = 1
model.config.use_cache = False

# Prepare the quantised model for training via PEFT's helper.
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# Load tokenizer
# add_eos_token=True is forwarded to the tokenizer — presumably it appends the EOS token to
# every encoded example; TODO confirm, since the training text already ends with "<|end_of_turn|>".
tokenizer = AutoTokenizer.from_pretrained(base_model, add_eos_token=True)
# Reuse the EOS token for padding.
# NOTE(review): with pad == eos, a collator that masks padding out of the labels may also
# mask genuine EOS tokens — confirm the fine-tuned model still learns to stop.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # pad at the end of each sequence

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# Core training hyper-parameters.
epochs = 1 # 4 was recommended by someone on the OpenAI forum, unless dataset is very small.
per_device_train_batch_size = 2  # examples per forward/backward pass
gradient_accumulation_steps = 2  # passes accumulated before each optimizer step
max_seq_length = 2048  # passed to SFTTrainer as max_seq_length

In [18]:
# Each optimizer step consumes batch_size * accumulation_steps examples; integer
# division means a trailing partial batch is not counted.
examples_per_step = per_device_train_batch_size * gradient_accumulation_steps
steps_per_epoch = len(dataset) // examples_per_step
print("Steps per epoch:", steps_per_epoch)

Steps per epoch: 5860


In [19]:
# Set training parameters
training_arguments = TrainingArguments(
    # Checkpoints are written under ./results/<fine_tuned_model>/checkpoint-<step>.
    output_dir="./results/" + fine_tuned_model,
    # Use the `epochs` setting instead of a second hard-coded 1.
    num_train_epochs=epochs,
    # max_steps overrides num_train_epochs, so scale it by `epochs` to keep the two in agreement.
    max_steps=steps_per_epoch * epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="paged_adamw_8bit",
    save_strategy="steps",
    evaluation_strategy="no",
    # Roughly 100 checkpoints per epoch; never 0, which a dataset with fewer than
    # 100 steps per epoch would otherwise produce.
    save_steps=max(1, steps_per_epoch // 100),
    logging_steps=1,
    learning_rate=2e-4,
    # NOTE(review): bnb_config computes in bfloat16 while this enables fp16 mixed
    # precision — confirm the mismatch is intentional (bf16=True is the usual pairing).
    fp16=True,
    # 0.03 is a fraction of the run, so it belongs in warmup_ratio. warmup_steps is an
    # absolute step count; passing 0.03 there gave effectively no warmup.
    warmup_ratio=0.03,
    group_by_length=True,
    #gradient_checkpointing=True,
)

In [20]:
# Supervised fine-tuning trainer: wraps the quantised model with the LoRA config
# and trains on the rendered text held in the dataset's "input" column.
trainer = SFTTrainer(
    # what to train
    model=model,
    peft_config=peft_config,
    tokenizer=tokenizer,
    # what to train on
    train_dataset=dataset,
    dataset_text_field="input",
    max_seq_length=max_seq_length,
    # how to train
    args=training_arguments,
    #packing=False
)

Map:   0%|          | 0/23440 [00:00<?, ? examples/s]

In [21]:
# Resume the run from the step-3770 checkpoint when it is present. The path is derived
# from the trainer's own output_dir so it cannot drift from the configuration above.
resume_checkpoint = os.path.join(training_arguments.output_dir, "checkpoint-3770")
if not os.path.isdir(resume_checkpoint):
    # Fresh environment (e.g. Restart & Run All on a new machine): start from step 0
    # instead of failing on a missing directory.
    print(f"No checkpoint found at {resume_checkpoint}; training from scratch.")
    resume_checkpoint = None
trainer.train(resume_from_checkpoint=resume_checkpoint)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
3771,0.0272
3772,0.0527
3773,0.0678
3774,0.0895
3775,0.0615
3776,0.0576
3777,0.0408
3778,0.0545
3779,0.1167
3780,0.0404




TrainOutput(global_step=5860, training_loss=0.022391758393814132, metrics={'train_runtime': 28899.2146, 'train_samples_per_second': 0.811, 'train_steps_per_second': 0.203, 'total_flos': 1.8166764464556442e+18, 'train_loss': 0.022391758393814132, 'epoch': 1.0})

In [21]:
# trainer.train()  # initial from-scratch run; left commented out so its recorded log (steps 1-10 below) is kept

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,0.536
2,0.5256
3,0.4892
4,0.4581
5,0.4626
6,0.4175
7,0.4152
8,0.3574
9,0.3408
10,0.3025




In [22]:
# Write the trained model to the `fine_tuned_model` directory.
# NOTE(review): trainer.model is presumably the PEFT-wrapped model, in which case only the
# LoRA adapter weights are saved here, and the tokenizer is not saved — confirm.
trainer.model.save_pretrained(fine_tuned_model)