In [None]:
# Optional Google Colab setup: uncomment the lines below to mount Google Drive
# and point `myDrive` at the mounted folder instead of the local directory.
# from google.colab import drive
# drive.mount('/content/drive')
# gdrive = "/content/drive/"
# myDrive = "/content/drive/MyDrive/"
# Prefix that later cells prepend to every data/output path they build; with the
# empty string those paths are relative to the current working directory.
myDrive = ""

In [None]:
import os
import shutil

pretrained_model_name = "Mistral-7B-Instruct"
# /content/drive/MyDrive/ARCSolver_core_knowledge_on_basic_arrays_10000.json

# Trainer checkpoints go to `output_dir`; the final fine-tuned model goes to
# `finetuned_model_folder`. Both live under the `myDrive` prefix.
output_dir = f"{myDrive}outputs_{pretrained_model_name}_finetuned_on_1000_array_basics"
finetuned_model_folder = f"{myDrive}{pretrained_model_name}_finetuned_on_1000_array_basics"

# Start from a clean slate: anything left at either location by a previous run
# is deleted before training begins.
for _stale_dir in (output_dir, finetuned_model_folder):
    if os.path.exists(_stale_dir):
        shutil.rmtree(_stale_dir)

In [None]:
# %pip install -q -U bitsandbytes
# %pip install -q -U git+https://github.com/huggingface/transformers.git
# %pip install -q -U git+https://github.com/huggingface/peft.git
# %pip install -q -U git+https://github.com/huggingface/accelerate.git
# %pip install -q datasets
# %pip install -q scipy

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling

# Quantization settings passed as `quantization_config` when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    # Load the weights in 4-bit using the "nf4" quantization type.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Double quantization is switched on.
    bnb_4bit_use_double_quant=True,
    # Compute dtype used alongside the 4-bit weights.
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Hub identifier of the base checkpoint used for both the model and its tokenizer.
pretrained_model = "mistralai/Mistral-7B-Instruct-v0.1"

# Quantized base model; the device_map sends the whole model (empty-string key) to device 0.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Matching tokenizer, loaded with add_eos_token=True.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model,
    add_eos_token=True,
)

'''
The following warnings can be ignored. I think they are caused by the fact that the model
is loaded in a single GPU and the parameters are offloaded to the CPU:

WARNING:root:Some parameters are on the meta device device because they were offloaded to the .
WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu/disk.
'''

In [None]:
import pandas as pd
import datasets
datasets.disable_progress_bar()
from datasets import Dataset

# Location of the JSON training file, resolved under the `myDrive` prefix.
data_file = f"{myDrive}ARCSolver_core_knowledge_on_basic_arrays_1000.json"

# Read the file with pandas first ...
df = pd.read_json(data_file)

# ... then wrap the resulting frame in a Hugging Face `Dataset`.
data = Dataset.from_pandas(df)

In [None]:
def generate_prompt(data_point):
    """Render one dataset row as a single instruction-tuning training string.

    Args:
        data_point: Mapping with an "instruction" entry (a string) and an
            "output" entry (any value; it is converted with ``str``).

    Returns:
        The text ``<s>[INST] {instruction} [/INST] {output}</s>``.

    NOTE(review): the text already carries the literal '<s>' and '</s>' markers;
    confirm the tokenizer used for training does not add its own BOS/EOS on top.
    """
    pieces = (
        '<s>[INST] ',
        data_point["instruction"],
        ' [/INST] ',
        str(data_point["output"]),
        '</s>',
    )
    return "".join(pieces)

# Render every row with `generate_prompt` and attach the results to the dataset
# as a new "prompt" column.
text_column = list(map(generate_prompt, data))
data = data.add_column("prompt", text_column)

In [None]:
# Hold out 20% of the examples for evaluation.
# A fixed seed keeps the split identical across kernel restarts; without it the
# evaluation set (and therefore the eval loss the trainer later uses to pick the
# best checkpoint) changes on every run.
# Note: `data` is rebound from the full dataset to the split result here, so this
# cell cannot be re-run without first re-running the cells that build `data`.
data = data.train_test_split(test_size=0.2, seed=42)
train_data = data["train"]
test_data = data["test"]

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Turn on gradient checkpointing, then run PEFT's k-bit training preparation on the
# quantized model; the returned model replaces `model`.
# NOTE(review): `model` is reloaded from scratch in a later cell before the trainer is
# built, so this preparation is not applied to the instance that is passed to the
# trainer -- confirm whether this cell is still needed.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA settings. The same object is later handed to SFTTrainer as `peft_config`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Module names the adapter targets.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)

# Wrap the prepared base model with the LoRA configuration.
peft_model = get_peft_model(model, lora_config)

In [None]:
# Register `lora_config` on `model` under the adapter name "adapter".
# NOTE(review): `get_peft_model(model, lora_config)` was already called on this model in
# the previous cell, and `model` is reloaded in the next cell, so this adapter appears to
# be discarded before training -- confirm whether this call is needed.
model.add_adapter(lora_config, adapter_name="adapter")

In [None]:
# Reload the base model before building the trainer. The original author found this
# necessary (root cause unknown) to avoid the following error when resuming training:
#   "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"
import gc

# Release the first copy of the model before loading the second one. `peft_model` and
# `model` both keep the earlier instance alive; unless these references are dropped,
# its weights stay allocated on the GPU next to the freshly loaded copy.
peft_model = None
model = None
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(pretrained_model, quantization_config=bnb_config, device_map={"":0})

'''
The following warnings can be ignored. I think they are caused by the fact that the model
is loaded in a single GPU and the parameters are offloaded to the CPU:

WARNING:root:Some parameters are on the meta device device because they were offloaded to the .
WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu/disk.
'''

In [None]:
%pip install -q trl
from trl import SFTTrainer

epochs = 4 # 4 was recommended by someone on the OpenAI forum, unless dataset is very small.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
max_seq_length = 512

# Number of optimizer steps in one pass over the training split (single device).
# Clamped to at least 1 so that `logging_steps`/`save_steps` below are never 0 when the
# training split is smaller than one effective batch.
steps_per_epoch = max(1, len(train_data)//(per_device_train_batch_size*gradient_accumulation_steps))
print("Steps per epoch:", steps_per_epoch)

total_steps = steps_per_epoch * epochs
print("Total steps:", total_steps)

# Reuse the EOS token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # 0.03 is a fraction of the total training steps, so it belongs in `warmup_ratio`;
        # `warmup_steps` is a step count, and 0.03 steps means effectively no warmup.
        warmup_ratio=0.03,
        learning_rate=2e-4,
        logging_steps=steps_per_epoch,
        output_dir=output_dir,
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        evaluation_strategy="epoch",
        num_train_epochs=epochs,
        save_steps=steps_per_epoch,
        fp16=True,
        gradient_checkpointing=True,
        load_best_model_at_end=True,
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# The following warning may still be printed when the trainer is created:
#
#   UserWarning: You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer.
#   This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision.
#   You might consider adding `tokenizer.padding_side = 'right'` to your code.
#
# NOTE(review): `tokenizer` is configured above but is not passed to SFTTrainer (only the
# data collator receives it). The warning is presumably raised for the tokenizer that
# SFTTrainer creates on its own when none is given -- confirm against the installed trl
# version. Before passing `tokenizer=tokenizer`, note that it was loaded with
# add_eos_token=True while the "prompt" text already ends in '</s>'.

In [None]:
# Set `use_cache` to False on the model config before training (added by the original
# author to silence warnings), then run the full training loop configured on `trainer`.
model.config.use_cache = False  # silence the warnings.
trainer.train()

'''
The following warning can be ignored as far as I can tell. Via web search: it happens to many people
and no fix has been found yet:

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer,
using the `__call__` method is faster than using a method to encode the text followed
by a call to the `pad` method to get a padded encoding.
'''

In [None]:
# Persist the trained model to `finetuned_model_folder` (that folder is wiped at the top
# of the notebook, so each run writes into a fresh directory).
# NOTE(review): with a `peft_config` on the trainer this presumably stores only the
# adapter weights rather than a merged full model -- confirm before loading it elsewhere.
trainer.save_model(finetuned_model_folder)