In [None]:
!pip install peft

In [None]:
!pip install trl

In [None]:
!pip install bitsandbytes

In [None]:
!pip install -U bitsandbytes

## Training code


In [None]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer

# Expose GPUs 0 and 1 to CUDA (the two T4 cards).
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# ---- Checkpoint to fine-tune and where results are written ----
output_dir = "./results"
model_name = "deepseek-ai/deepseek-math-7b-instruct"  # swap in another checkpoint name here if needed

# ---- LoRA adapter hyperparameters (QLoRA) ----
lora_dropout = 0.1  # dropout probability applied in the LoRA layers
lora_alpha = 16  # LoRA scaling parameter
lora_r = 64  # LoRA attention dimension / rank

# ---- bitsandbytes quantization settings ----
use_4bit = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"  # dtype name; resolved to a torch dtype when the model is loaded
use_nested_quant = False  # forwarded as bnb_4bit_use_double_quant

# ---- Run length and batching ----
num_train_epochs = 5
per_device_train_batch_size = 4  # batch size per GPU
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1

# ---- Optimizer and learning-rate schedule ----
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 1e-3
max_grad_norm = 0.3
lr_scheduler_type = "cosine"
warmup_ratio = 0.03

# ---- Checkpoint / logging cadence (in optimizer steps) ----
save_steps = 0
logging_steps = 50

# Read the problems CSV from the Kaggle input directory, then wrap the frame as a Hugging Face Dataset.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/aimo-24-processor-art-of-problem-solving/problems.csv",
)
dataset = Dataset.from_pandas(df=data)

# Fetch the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad on the right, and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples, max_length=256):
    """Tokenize a batch of rows into fixed-length training sequences.

    Each row's ``problem``, ``solution`` and ``answer`` values are joined with
    single spaces into one string, and the resulting batch of strings is
    tokenized with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping holding equal-length ``'problem'``,
            ``'solution'`` and ``'answer'`` sequences (the layout
            ``Dataset.map`` passes when called with ``batched=True``).
        max_length: Token length every sequence is truncated and padded to.
            Defaults to 256, the value that used to be hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the batch of joined strings.
    """
    rows = zip(examples['problem'], examples['solution'], examples['answer'])
    # Skip missing fields: formatting None directly would put the literal
    # text "None" into the training string.
    inputs = [
        " ".join(str(field) for field in row if field is not None)
        for row in rows
    ]
    # NOTE(review): the answer is the last part of each string, so rows longer
    # than max_length presumably lose it to truncation -- confirm the limit is
    # large enough for this dataset.
    return tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")

# Tokenize every row in batches and drop all raw columns, so only the
# tokenizer's output fields remain in the dataset.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)
# Hold out 10% of the rows for evaluation. The fixed seed keeps the
# train/eval partition identical across kernel restarts.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# LoRA adapter settings for causal language modelling; rank, scaling and
# dropout come from the QLoRA parameters defined earlier in this cell.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

# Trainer hyperparameters; the named values come from the training
# arguments defined earlier in this cell.
training_arguments = TrainingArguments(
    # Output location, reporting backend and step cadences.
    output_dir=output_dir,
    report_to="tensorboard",
    logging_steps=logging_steps,
    save_steps=save_steps,
    # Run length and batching.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    # Mixed-precision training in fp16.
    fp16=True,
)

# Load model
# Turn the configured dtype name (e.g. "float16") into the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Let Hugging Face automatically place the model on available devices
)
# Training does not reuse cached key/values. Disabling the cache should stop
# the forward pass from building one, which appears to be the source of the
# `past_key_values` deprecation warning printed in this cell's output.
model.config.use_cache = False

# SFT Trainer
# NOTE(review): train_dataset is already tokenized and has no "text" column
# left after the map step, and it was tokenized to 256 tokens rather than 512.
# Confirm whether the installed TRL version actually uses dataset_text_field
# and max_seq_length here.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # the held-out split was built but never handed to the trainer
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,  # Set max sequence length
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
)

# Start training
trainer.train()

# Persist the trained weights explicitly; nothing else in this cell writes
# the model to disk.
trainer.save_model(output_dir)


2024-08-06 18:19:38.404389: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 18:19:38.404451: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 18:19:38.405973: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/7143 [00:00<?, ? examples/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
