In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
from trl import SFTTrainer
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
data = pd.read_csv("/home/bhx5gh/Documents/CBM/CBM_Final_Project/Data/instruct_tune_dataset/prompt_dataset.csv", quoting=csv.QUOTE_NONNUMERIC,
            escapechar='\\',
            encoding='utf-8')

# Select the desired columns
comments = data[["prompt", "response"]].sample(frac=0.3, random_state=42)

# Split into train, validation, and test sets
train_comments, test_comments = train_test_split(comments, test_size=0.1, random_state=42)

# Create Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_comments.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_comments.reset_index(drop=True))

# Combine into DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 31902
    })
    test: Dataset({
        features: ['prompt', 'response'],
        num_rows: 3545
    })
})


In [3]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM"
)

model = AutoModelForCausalLM.from_pretrained(
    "/home/bhx5gh/Documents/CBM/CBM_Final_Project/src/LLM/Local Models/Mistral-7b-Instruct-v0_3",
    device_map='auto',
    use_cache=False
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

tokenizer = AutoTokenizer.from_pretrained("/home/bhx5gh/Documents/CBM/CBM_Final_Project/src/LLM/Local Models/Mistral-7b-Instruct-v0_3")

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 3/3 [00:14<00:00,  4.98s/it]


In [4]:
args = TrainingArguments(
  output_dir = "/home/bhx5gh/Documents/CBM/CBM_Final_Project/checkpoints",
  num_train_epochs=2,
  #max_steps = 100, 
  per_device_train_batch_size = 24,
  per_device_eval_batch_size = 32,
  warmup_steps = 100,
  save_strategy="steps",
  evaluation_strategy="steps",
  eval_steps = 600,
  save_steps = 600,
  learning_rate=2e-5,
  bf16=True,
  lr_scheduler_type='constant',
  eval_on_start = True
)

def create_prompt(sample):
  prompt = sample['prompt']
  response = sample['response']
  return f'{prompt}\n{response}'



In [5]:
max_seq_length = 200

trainer = SFTTrainer(
  model=model,
  peft_config=peft_config,
  max_seq_length=max_seq_length,
  tokenizer=tokenizer,
  packing=True,
  formatting_func=create_prompt,
  args=args,
  train_dataset=dataset["train"],
  eval_dataset=dataset["test"]
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
trainer.train()

Step,Training Loss,Validation Loss
0,No log,2.853468
600,2.258800,2.135086
