### dataset: sanduntg/test_vsort_classification
### model: TheBloke/Llama-2-7B-Chat-GPTQ

In [None]:
# Install / upgrade the libraries this notebook relies on.
# Quantization backend and parameter-efficient fine-tuning.
!pip install -q -U bitsandbytes
!pip install -q -U peft
# Dataset handling and evaluation utilities.
%pip install -U datasets
!pip install evaluate
# trl and accelerate are installed from the GitHub default branch rather than PyPI.
!pip install -U git+https://github.com/huggingface/trl
!pip install git+https://github.com/huggingface/accelerate
!pip install --upgrade transformers

In [None]:
import json
import os

import pandas as pd
import torch
import transformers
from datasets import Dataset
from peft import LoraConfig, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)
from trl import RewardConfig, RewardTrainer, SFTTrainer

In [None]:
# Parquet file with the training data, read from the Kaggle input mount.
DATA_PATH = "/kaggle/input/test-summarize/train.parquet"

# Base model provided by Meta; access to the repository must be granted
# before it can be downloaded.
MODEL_PATH = "meta-llama/Llama-2-7b-hf"

In [None]:
# Load the parquet file and keep only the first 10 rows before wrapping
# the frame in a Hugging Face Dataset.
df = pd.read_parquet(DATA_PATH).head(10)
raw_dataset = Dataset.from_pandas(df)
# Last expression of the cell: shows the dataset summary.
raw_dataset

In [None]:
# Matmuls on the quantized weights are computed in half precision.
compute_dtype = torch.float16

# bitsandbytes settings: 4-bit NF4 weights, no nested (double) quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
## Define the tokenizer and the reward model.
# Hugging Face access token, read from the HF_TOKEN environment variable so
# the secret never lives in the notebook. When unset, None is passed and the
# hub client decides which credentials (if any) to use.
hf_auth = os.environ.get("HF_TOKEN") or None
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, token=hf_auth)
# A reward model emits one scalar score per sequence, so the backbone is
# loaded with a single-label sequence-classification head instead of a
# language-modeling head. This is what RewardTrainer expects and it matches
# the TaskType.SEQ_CLS used for the LoRA config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    token=hf_auth,
    device_map="auto",
    quantization_config=quant_config,
    num_labels=1,
)

In [None]:
# Register a dedicated padding token; it is needed for the fixed-length
# ("max_length") padding applied in formatting_func.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Adding a token enlarges the vocabulary, so the embedding matrix has to grow
# with it -- otherwise the new [PAD] id indexes past the end of the embedding
# table. The model config must also know the pad id so padded batches are
# handled correctly.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id


def formatting_func(examples):
    """Tokenize one preference record into chosen/rejected model inputs.

    Args:
        examples: A single dataset row with string fields "prompt",
            "chosen" and "rejected".

    Returns:
        A dict with "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected", each padded or
        truncated to 256 tokens.
    """
    kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }

    # Prepend the prompt and a line break to the chosen and rejected responses.
    prompt_plus_chosen_response = examples["prompt"] + "\n" + examples["chosen"]
    prompt_plus_rejected_response = examples["prompt"] + "\n" + examples["rejected"]

    # Tokenize both variants. No tensors are requested: the dataset stores
    # plain lists, so a tensor round-trip plus [0] indexing adds nothing.
    tokens_chosen = tokenizer(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer(prompt_plus_rejected_response, **kwargs)

    return {
        "input_ids_chosen": tokens_chosen["input_ids"],
        "attention_mask_chosen": tokens_chosen["attention_mask"],
        "input_ids_rejected": tokens_rejected["input_ids"],
        "attention_mask_rejected": tokens_rejected["attention_mask"],
    }

In [None]:
# Tokenize every row with formatting_func, then split the result into
# "train" and "test" subsets using train_test_split's default settings.
formatted_dataset = raw_dataset.map(formatting_func).train_test_split()

In [None]:
# Show the loaded model's configuration as the cell output.
model.config

In [None]:
# LoRA adapter settings for a sequence-classification task, configured for
# training (inference_mode is off).
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
)

In [None]:
# Hyper-parameters for reward-model training.
# The run is deliberately not forced onto the CPU (no `no_cuda=True`): the
# model is loaded 4-bit through bitsandbytes with device_map="auto", so the
# trainer has to operate on the same accelerator that holds the weights.
training_args = RewardConfig(
    output_dir="rm_checkpoint/",
    max_length=256,  # same limit as the tokenization in formatting_func
    num_train_epochs=1,
    logging_steps=10,
    gradient_accumulation_steps=1,
    save_strategy="steps",
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    eval_steps=500,
    save_steps=500,
    warmup_steps=100,
    logging_dir="./logs",
    learning_rate=1e-5,
    save_total_limit=1,
)

In [None]:
# Wire the model, tokenizer, data splits, LoRA config and training arguments
# into a RewardTrainer, then start training.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
    peft_config=peft_config,
)
trainer.train()