In [1]:
import os
import shutil

pretrained_model_name = "Llama-2-7b-chat-hf"
output_dir = "outputs " + pretrained_model_name+ "_finetuned_on_100_array_basics"
finetuned_model_folder = pretrained_model_name + "_finetuned_on_100_array_basics"

if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
    
if os.path.exists(finetuned_model_folder):
    shutil.rmtree(finetuned_model_folder)

In [2]:
%pip install -q -U bitsandbytes
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q datasets
%pip install -q scipy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [4]:
pretrained_model = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(pretrained_model, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, add_eos_token=True)

'''
The following warnings can be ignored. I think they are caused by the fact that the model 
is loaded in a single GPU and the parameters are offloaded to the CPU:

WARNING:root:Some parameters are on the meta device device because they were offloaded to the .
WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu/disk.
'''

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]





In [5]:
import pandas as pd
import datasets
datasets.disable_progress_bar()
from datasets import Dataset

# Load the data using pandas
data_file = "ARCSolver_core_knowledge_on_basic_arrays_100.json"
df = pd.read_json(data_file)

# Convert the pandas dataframe to a dataset
data = Dataset.from_pandas(df)

In [6]:
def generate_prompt(data_point):
    text = '<s>[INST] ' + data_point["instruction"] + ' [/INST] ' + str(data_point["output"]) + '</s>'
    return text

# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in data]
data = data.add_column("prompt", text_column)

In [7]:
data = data.train_test_split(test_size=0.2)
train_data = data["train"]
test_data = data["test"]

In [8]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [9]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj","o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(model, lora_config)

In [10]:
model.add_adapter(lora_config, adapter_name="adapter")

In [11]:
# Reload the model to avoid the following error. Don't know why, but this has to be done:" 
# Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! when resuming training"
model = AutoModelForCausalLM.from_pretrained(pretrained_model, quantization_config=bnb_config, device_map={"":0})

'''
The following warnings can be ignored. I think they are caused by the fact that the model 
is loaded in a single GPU and the parameters are offloaded to the CPU:

WARNING:root:Some parameters are on the meta device device because they were offloaded to the .
WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu/disk.
'''

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]





In [12]:
%pip install -q trl
from trl import SFTTrainer

epochs = 4 # 4 was recommended by someone on the OpenAI forum, unless dataset is very small.
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
max_seq_length = 300

steps_per_epoch = len(train_data)//(per_device_train_batch_size*gradient_accumulation_steps)
print("Steps per epoch:", steps_per_epoch)

total_steps = steps_per_epoch * epochs
print("Total steps:", total_steps)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=0.03,
        learning_rate=2e-4,
        logging_steps=steps_per_epoch,
        output_dir=output_dir,
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        evaluation_strategy="epoch",
        num_train_epochs=epochs,
        save_steps=steps_per_epoch,
        load_best_model_at_end=True,
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

'''
The following warning can be ignored as far as I can tell. Via web search: it happens to many people, 
and no fix has been found yet:

UserWarning: You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. 
This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. 
You might consider adding `tokenizer.padding_side = 'right'` to your code.
'''

Note: you may need to restart the kernel to use updated packages.
Steps per epoch: 80
Total steps: 320




In [13]:
trainer.train()

'''
The following warning can be ignored as far as I can tell. Via web search: it happens to many people
and no fix has been found yet:

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, 
using the `__call__` method is faster than using a method to encode the text followed 
by a call to the `pad` method to get a padded encoding.
'''

  0%|          | 0/320 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.5914, 'learning_rate': 0.000150014063818483, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.36166372895240784, 'eval_runtime': 14.8933, 'eval_samples_per_second': 1.343, 'eval_steps_per_second': 0.201, 'epoch': 1.0}
{'loss': 0.3149, 'learning_rate': 0.00010000937587898864, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3546998202800751, 'eval_runtime': 15.316, 'eval_samples_per_second': 1.306, 'eval_steps_per_second': 0.196, 'epoch': 2.0}
{'loss': 0.3037, 'learning_rate': 5.000468793949432e-05, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.352175235748291, 'eval_runtime': 15.4787, 'eval_samples_per_second': 1.292, 'eval_steps_per_second': 0.194, 'epoch': 3.0}
{'loss': 0.2926, 'learning_rate': 0.0, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.35626959800720215, 'eval_runtime': 15.5522, 'eval_samples_per_second': 1.286, 'eval_steps_per_second': 0.193, 'epoch': 4.0}
{'train_runtime': 600.224, 'train_samples_per_second': 0.533, 'train_steps_per_second': 0.533, 'train_loss': 0.3756533324718475, 'epoch': 4.0}




In [14]:
trainer.save_model(finetuned_model_folder)