In [None]:
!pip install -U bitsandbytes
!pip install trl
!pip install datasets<4.0.0

/bin/bash: line 1: 4.0.0: No such file or directory


In [28]:
import torch
import torch.nn as nn
from typing import Any
import pandas as pd
import tokenizers
from torch.utils.data import DataLoader, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training,  AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
from trl import SFTTrainer
from datasets import load_dataset

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [29]:
# ---------------------------------------------------------------------------
# Hyperparameters (single place to tune the run)
# ---------------------------------------------------------------------------

# Reproducibility
SEED = 42  # NOTE(review): not referenced by the visible cells — confirm it is wired in

# LoRA adapter
LORA_RANK = 32
DROPOUT = 0.1  # passed as lora_dropout

# Data / generation
DATA_MAX_LEN = 768  # NOTE(review): not referenced by the visible cells
TEMPERATURE = 0.8   # NOTE(review): not referenced by the visible cells

# Optimisation
EPOCHS = 3
BATCH_SIZE = 2                # per-device batch size (train and eval)
GRADIENT_ACCUMULATION = 16    # micro-batches accumulated per optimizer step
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.01
GRAD_CLIP = 1                 # max gradient norm; 0.3 was a previously tried value

In [20]:
# Base checkpoint to fine-tune.
model_name = "Qwen/Qwen3-8B"

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
    )

# Load the quantized model, letting accelerate place the layers ("auto" device map).
# NOTE(review): trust_remote_code=True executes code shipped with the checkpoint;
# confirm it is actually required for this model before keeping it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    torch_dtype = torch.bfloat16,
    quantization_config = bnb_config,
    trust_remote_code=True
  )
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Take the id from the tokenizer (not model.config.eos_token_id) so the model
    # and tokenizer are guaranteed to agree on a single integer pad id.
    model.config.pad_token_id = tokenizer.pad_token_id


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [21]:
# Run PEFT's k-bit preparation helper on the quantized base model before
# attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapter configuration: rank from LORA_RANK, alpha fixed at twice the rank,
# applied to the attention and MLP projection layers listed below.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_RANK,
    lora_alpha=LORA_RANK * 2,
    lora_dropout=DROPOUT,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 87,293,952 || all params: 8,278,029,312 || trainable%: 1.0545


In [30]:
def retrieve_data(dataset_name, tokenizer, test_size=0.2, seed=42):
    """Load a CaseHOLD-style dataset and return an 80/20 train/test split.

    Args:
        dataset_name: Hub id of the dataset to load (config ``"all"``).
        tokenizer: Currently unused; kept so existing callers keep working.
        test_size: Fraction of rows placed in the test split.
        seed: Seed for the shuffle performed by ``train_test_split``.

    Returns:
        ``(train_dataset, test_dataset)``; each row gains a ``holdings`` column
        holding the five candidate holdings joined by single spaces.
    """
    # split="train" yields a single Dataset. Without it load_dataset returns a
    # DatasetDict keyed by split name, which has no columns and no
    # train_test_split method.
    # NOTE(review): trust_remote_code=True runs the dataset's loading script;
    # it is needed for script-based datasets on datasets<4.0.0 — confirm the
    # source is trusted.
    dataset = load_dataset(
        dataset_name, "all", split="train", trust_remote_code=True
    )

    holding_columns = [f"holding_{i}" for i in range(5)]

    def _join_holdings(example):
        # Dataset columns cannot be concatenated with `+`; build the combined
        # string per row instead. Missing holdings are treated as empty text.
        return {
            "holdings": " ".join(example[col] or "" for col in holding_columns)
        }

    dataset = dataset.map(_join_holdings)

    split_dataset = dataset.train_test_split(test_size=test_size, seed=seed)
    return split_dataset["train"], split_dataset["test"]

In [31]:
train_dataset, test_dataset = retrieve_data('casehold/casehold', tokenizer)


def _to_sft_text(example):
    """Render one example as a single training string under the ``text`` key.

    Assumes the rows expose ``citing_prompt``, ``holding_0``..``holding_4`` and
    ``label`` — TODO confirm against the loaded dataset. The prompt template is
    a starting point, not a tuned format.
    """
    options = "\n".join(f"{i}: {example[f'holding_{i}']}" for i in range(5))
    return {
        "text": (
            f"{example['citing_prompt']}\n\n"
            f"Candidate holdings:\n{options}\n\n"
            f"Answer: {example['label']}"
        )
    }


# SFTTrainer needs a text column to tokenize; the raw rows have none, so build it
# here and drop the original columns.
train_dataset = train_dataset.map(
    _to_sft_text, remove_columns=train_dataset.column_names
)
test_dataset = test_dataset.map(
    _to_sft_text, remove_columns=test_dataset.column_names
)

training_args = TrainingArguments(
    output_dir="finetuned_model",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=GRAD_CLIP,
    logging_steps=10,
    # Save and evaluate on the same schedule, as load_best_model_at_end requires.
    save_strategy="epoch",
    eval_strategy="epoch",
    optim="paged_adamw_8bit",
    warmup_ratio=WARMUP_RATIO,
    report_to="none",
    bf16=True, # Enable bfloat16 training
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    seed=SEED,  # make the configured seed explicit instead of relying on the default
)

# `model` is already wrapped by get_peft_model, so peft_config is NOT passed
# again here: supplying both a PeftModel and a peft_config is redundant.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

RuntimeError: Dataset scripts are no longer supported, but found casehold.py