In [1]:
# %pip install -q -U bitsandbytes
# %pip install -q -U git+https://github.com/huggingface/transformers.git
# %pip install -q -U git+https://github.com/huggingface/peft.git
# %pip install -q -U git+https://github.com/huggingface/accelerate.git
# %pip install -q -U trl

In [2]:
# %pip install -q -U scipy
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer



In [3]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [5]:
# Load the entire model on the GPU 0
device_map = {"": 0}

In [6]:
from datasets import Dataset
import pandas as pd

# Load the data using pandas
data_file = "../data/ARCSolver_minimal_puzzles_20000.json"
df = pd.read_json(data_file)

# Convert the pandas dataframe to a dataset
dataset = Dataset.from_pandas(df)

def generate_prompt(data_point):
    text = '<s>[INST] ' + data_point["instruction"] + ' [/INST] ' + str(data_point["output"]) + '</s>'
    return text

# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)

# dataset = dataset.train_test_split(test_size=0.2)
# train_data = dataset["train"]
# test_data = dataset["test"]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [7]:
# Load base model
model_name = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    # local_files_only=True  # Add this line if the model is stored locally
)
model.config.use_cache = False
# model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Load tokenizer
base_model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
new_adapter_name = "Mistral-7B-finetuned_on_20000_minimal_puzzles"

epochs = 1 # 4 was recommended by someone on the OpenAI forum, unless dataset is very small.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
max_seq_length = 4096

output_dir = "../results/" + new_adapter_name

steps_per_epoch = len(dataset)//(per_device_train_batch_size*gradient_accumulation_steps)
print("Steps per epoch:", steps_per_epoch)

# total_steps = steps_per_epoch * epochs
# print("Total steps:", total_steps)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    max_steps=steps_per_epoch,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    save_strategy="steps",
    evaluation_strategy="no",
    save_steps=steps_per_epoch//100,
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    warmup_steps=0.03,
    group_by_length=True,
    gradient_checkpointing=True,
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

Steps per epoch: 2500


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# Train model
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,0.507
2,0.2023
3,0.2735
4,0.4984
5,0.3686
6,0.3326
7,0.3925
8,0.5734
9,0.6443
10,0.3292


TrainOutput(global_step=2500, training_loss=0.3703285286307335, metrics={'train_runtime': 75320.0423, 'train_samples_per_second': 0.266, 'train_steps_per_second': 0.033, 'total_flos': 2.459807721419145e+18, 'train_loss': 0.3703285286307335, 'epoch': 1.0})

In [10]:
# Save trained model
adapter_from_merged = "../adapters/" + new_adapter_name
trainer.model.save_pretrained(adapter_from_merged)