<a href="https://colab.research.google.com/github/praj-pawar/generative-verifiers-using-LLMs/blob/main/sdpo_CoT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU bitsandbytes datasets accelerate loralib peft transformers trl

In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import DPOTrainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from torch.utils.data import Dataset
import logging
from tqdm import tqdm
from huggingface_hub import login

import os

# Authenticate against the Hugging Face Hub without embedding a credential in
# the notebook. The token is read from the HF_TOKEN environment variable; when
# it is unset or empty, None is passed so that `login` falls back to its
# interactive prompt instead of submitting an empty token.
login(os.environ.get("HF_TOKEN") or None)

# Emit INFO-level records and create the module logger used by main().
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class StepwiseDataset(Dataset):
    """Map-style dataset turning step-annotated records into tokenized pairs.

    Every record in ``data`` is read as a mapping with a ``'prompt'`` string
    and a ``'steps'`` list whose entries provide ``'text'`` (preferred step),
    ``'rejected'`` (alternative step) and ``'score'``.

    Args:
        data: Indexable collection of records (must support ``len`` and
            integer indexing).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` and return ``(input_ids, attention_mask)``."""
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) removes only the leading batch axis. A bare squeeze()
        # would also collapse the sequence axis whenever max_length == 1 and
        # hand back 0-d tensors.
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Build the chosen/rejected texts and their encodings for one record.

        Returns:
            dict with the raw ``'prompt'``, the space-joined ``'chosen'`` and
            ``'rejected'`` texts (prompt followed by every step), their
            ``*_input_ids`` / ``*_attention_mask`` tensors, and ``'scores'``,
            a float tensor holding one entry per step.
        """
        item = self.data[idx]
        prompt = item['prompt']
        steps = item['steps']

        chosen_text = prompt + " " + " ".join(step['text'] for step in steps)
        rejected_text = prompt + " " + " ".join(step['rejected'] for step in steps)

        chosen_ids, chosen_mask = self._encode(chosen_text)
        rejected_ids, rejected_mask = self._encode(rejected_text)

        return {
            'prompt': prompt,
            'chosen': chosen_text,
            'rejected': rejected_text,
            'chosen_input_ids': chosen_ids,
            'chosen_attention_mask': chosen_mask,
            'rejected_input_ids': rejected_ids,
            'rejected_attention_mask': rejected_mask,
            'scores': torch.tensor([step['score'] for step in steps], dtype=torch.float),
        }

class StepwiseDPOTrainer(DPOTrainer):
    """DPOTrainer subclass with a score-weighted pairwise loss and an LLM critic.

    In addition to the regular trainer state it keeps a second causal LM
    (``reward_model`` / ``reward_tokenizer``) that is prompted to write
    free-text reasoning about individual solution steps.

    Args:
        *args, **kwargs: Forwarded unchanged to ``DPOTrainer.__init__``.
            Keyword arguments must be accepted here because callers construct
            the trainer with ``model=...``, ``args=...`` and ``tokenizer=...``.
        reward_model_name: Hub id or local path of the critic checkpoint.
    """

    def __init__(self, *args, reward_model_name="meta-llama/Llama-2-7b-hf", **kwargs):
        super().__init__(*args, **kwargs)
        self.reward_model = AutoModelForCausalLM.from_pretrained(reward_model_name)
        self.reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_name)

    def get_llm_score(self, prompt, step):
        """Ask the critic model to reason about one solution step.

        Args:
            prompt: Problem statement.
            step: Text of the step to evaluate.

        Returns:
            ``(score, reasoning)``. ``score`` is always ``0`` (kept only so
            the signature stays stable); ``reasoning`` is the decoded first
            sequence returned by ``generate``.
        """
        # Construct input for chain-of-thought reasoning
        cot_prompt = f"""
        Problem: {prompt}
        Step: {step}

        Let's evaluate this step:
        1. Is the step logically correct?
        2. Does it contribute to solving the problem?
        3. Is it clear and well-explained?

        Reasoning:
        """

        # The prompt tensors must sit on the critic's device. The critic is
        # loaded separately from the policy and is never moved with it, so
        # using the policy's device here can cause a device mismatch.
        inputs = self.reward_tokenizer(
            cot_prompt, return_tensors="pt", truncation=True, max_length=512
        ).to(self.reward_model.device)

        with torch.no_grad():
            outputs = self.reward_model.generate(**inputs, max_new_tokens=200, num_return_sequences=1)

        reasoning = self.reward_tokenizer.decode(outputs[0], skip_special_tokens=True)

        # We're not using this score, but keeping the function signature consistent
        score = 0

        return score, reasoning

    @staticmethod
    def _stepwise_sequence_logps(model, input_ids, attention_mask):
        """Return the summed log-probability of each sequence's own tokens.

        The logits at position ``t`` are scored against the token at position
        ``t + 1``; positions whose target is masked out by ``attention_mask``
        contribute nothing. The result has one value per batch row.
        """
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        log_probs = torch.log_softmax(logits[:, :-1, :], dim=-1)
        targets = input_ids[:, 1:].unsqueeze(-1)
        token_logps = log_probs.gather(-1, targets).squeeze(-1)
        target_mask = attention_mask[:, 1:].to(token_logps.dtype)
        return (token_logps * target_mask).sum(dim=-1)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Score-weighted log-likelihood margin between chosen and rejected.

        Args:
            model: Model being trained.
            inputs: Batch with ``chosen_input_ids``, ``chosen_attention_mask``,
                ``rejected_input_ids``, ``rejected_attention_mask`` and
                ``scores``.
            return_outputs: When true, also return a dict with the detached
                per-sequence log-probabilities.
            **kwargs: Extra arguments a trainer loop may pass; ignored.

        Returns:
            Scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        chosen_logps = self._stepwise_sequence_logps(
            model, inputs['chosen_input_ids'], inputs['chosen_attention_mask']
        )
        rejected_logps = self._stepwise_sequence_logps(
            model, inputs['rejected_input_ids'], inputs['rejected_attention_mask']
        )

        scores = inputs['scores'].to(device=chosen_logps.device, dtype=chosen_logps.dtype)
        if scores.dim() > 1:
            # NOTE(review): per-step scores are averaged into one weight per
            # example so they line up with the per-sequence margin; confirm
            # this is the intended reduction.
            scores = scores.mean(dim=-1)

        loss = -torch.mean(scores * (chosen_logps - rejected_logps))

        if return_outputs:
            return loss, {
                'chosen_logps': chosen_logps.detach(),
                'rejected_logps': rejected_logps.detach(),
            }
        return loss

def process_data(sample, get_llm_score):
    """Convert one raw annotated sample into a prompt plus preference steps.

    Args:
        sample: Mapping with ``sample['question']['problem']`` and
            ``sample['label']['steps']``. Each step provides
            ``'chosen_completion'`` (index or ``None``), ``'human_completion'``
            and a ``'completions'`` list of ``{'text', 'rating'}`` entries.
        get_llm_score: Callable ``(problem, step_text) -> (score, reasoning)``;
            only the reasoning is kept.

    Returns:
        ``{'prompt': problem, 'steps': [...]}`` where every step holds
        ``'text'``, ``'rejected'``, ``'score'`` (the chosen completion's
        rating) and ``'reasoning'``. Steps with neither a chosen index nor a
        human completion are dropped.
    """
    problem = sample['question']['problem']
    steps = sample['label']['steps']

    processed_steps = []
    for step in steps:
        completions = step['completions']
        chosen_index = step['chosen_completion']

        if chosen_index is not None:
            chosen_step = completions[chosen_index]
            # Next completion, wrapping around; with a single completion this
            # pairs the step with itself.
            rejected_step = completions[(chosen_index + 1) % len(completions)]
        elif step['human_completion']:
            chosen_step = step['human_completion']
            # Without any model completion there is nothing to contrast with.
            # Pair the human step with itself (as the wrap-around above does
            # for a lone completion) instead of raising IndexError.
            rejected_step = completions[0] if completions else chosen_step
        else:
            continue

        # Get the chain of thought reasoning
        _, reasoning = get_llm_score(problem, chosen_step['text'])

        processed_steps.append({
            'text': chosen_step['text'],
            'rejected': rejected_step['text'],
            # Use the rating from the dataset as the score
            'score': chosen_step['rating'],
            'reasoning': reasoning,
        })

    return {
        'prompt': problem,
        'steps': processed_steps
    }

def load_data(file_path, get_llm_score):
    """Load a JSON data file and convert every record with ``process_data``.

    The file is read through ``load_dataset('json', ...)``; the resulting
    ``'train'`` split is mapped record by record, and all of its original
    columns are removed so only the converted fields remain.

    Args:
        file_path: Path handed to ``load_dataset`` as ``data_files``.
        get_llm_score: Callable forwarded to ``process_data`` for each record.

    Returns:
        The mapped ``'train'`` split.
    """
    train_split = load_dataset('json', data_files=file_path)['train']

    def _convert(record):
        return process_data(record, get_llm_score)

    original_columns = train_split.column_names
    return train_split.map(_convert, remove_columns=original_columns)

def main():
    """Run the full stepwise-DPO fine-tuning pipeline.

    Loads the tokenizer plus a policy and a reference copy of the base
    checkpoint, wraps the policy with LoRA adapters, builds the trainer, uses
    the trainer's critic to preprocess the train/test JSONL files, trains, and
    saves the result to ``./final_model``.
    """
    logger.info("Starting Stepwise DPO training with LLM reward model")

    checkpoint = "meta-llama/Llama-3.2-1B"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    policy = AutoModelForCausalLM.from_pretrained(checkpoint)
    reference = AutoModelForCausalLM.from_pretrained(checkpoint)

    # Fall back to EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        for network in (policy, reference):
            network.config.pad_token_id = tokenizer.pad_token_id

    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for network in (policy, reference):
        network.to(target_device)

    # Only the policy receives LoRA adapters; the reference stays as loaded.
    lora_settings = {
        "r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    policy = get_peft_model(policy, LoraConfig(**lora_settings))

    trainer_settings = {
        "output_dir": "./results",
        "num_train_epochs": 3,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-5,
        "weight_decay": 0.01,
        "logging_dir": "./logs",
        "logging_steps": 10,
        "evaluation_strategy": "steps",
        "eval_steps": 500,
        "save_strategy": "steps",
        "save_steps": 1000,
        "load_best_model_at_end": True,
        "remove_unused_columns": False,
    }
    training_args = TrainingArguments(**trainer_settings)

    trainer = StepwiseDPOTrainer(
        model=policy,
        ref_model=reference,
        args=training_args,
        tokenizer=tokenizer,
    )

    # The trainer must exist first: its critic method annotates the raw data.
    raw_train, raw_eval = (
        load_data(path, trainer.get_llm_score)
        for path in ('phase2_train.jsonl', 'phase2_test.jsonl')
    )
    trainer.train_dataset, trainer.eval_dataset = (
        StepwiseDataset(raw, tokenizer, max_length=512)
        for raw in (raw_train, raw_eval)
    )

    trainer.train()
    trainer.save_model("./final_model")

    logger.info("Stepwise DPO training with LLM reward model completed successfully")


# Script entry point.
if __name__ == "__main__":
    main()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import DPOTrainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from torch.utils.data import Dataset
import logging
from tqdm import tqdm
from huggingface_hub import login

import os

# SECURITY: a Hub access token used to be hard-coded on this line. Any token
# that was ever committed here must be treated as leaked and revoked.
# The credential is now taken from the HF_TOKEN environment variable; when it
# is unset or empty, None is passed so `login` falls back to its interactive
# prompt.
login(os.environ.get("HF_TOKEN") or None)

# Emit INFO-level records and create the module logger used by main().
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class StepwiseDataset(Dataset):
    """Dataset of tokenized chosen/rejected solutions built from step records.

    A record is a mapping with a ``'prompt'`` string and a ``'steps'`` list;
    each step supplies ``'text'`` (preferred), ``'rejected'`` (alternative)
    and ``'score'``.

    Args:
        data: Indexable, sized collection of records.
        tokenizer: Callable used as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Fixed length of every encoded sequence.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return how many records the dataset holds."""
        return len(self.data)

    def _tokenize(self, text):
        """Encode ``text`` and strip the batch axis from both tensors."""
        batch = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Only axis 0 (the batch of one) is removed. An unqualified squeeze()
        # would additionally drop the sequence axis when max_length == 1.
        return {
            'input_ids': batch['input_ids'].squeeze(0),
            'attention_mask': batch['attention_mask'].squeeze(0),
        }

    def __getitem__(self, idx):
        """Return texts, encodings and per-step scores for record ``idx``.

        The chosen and rejected texts are the prompt followed by all preferred
        (respectively alternative) steps, joined by single spaces.
        """
        record = self.data[idx]
        prompt = record['prompt']
        steps = record['steps']

        chosen_text = " ".join([prompt + " "[:0]] + [" ".join(s['text'] for s in steps)])
        rejected_text = " ".join([prompt] + [" ".join(s['rejected'] for s in steps)])

        chosen = self._tokenize(chosen_text)
        rejected = self._tokenize(rejected_text)
        step_scores = [s['score'] for s in steps]

        return {
            'prompt': prompt,
            'chosen': chosen_text,
            'rejected': rejected_text,
            'chosen_input_ids': chosen['input_ids'],
            'chosen_attention_mask': chosen['attention_mask'],
            'rejected_input_ids': rejected['input_ids'],
            'rejected_attention_mask': rejected['attention_mask'],
            'scores': torch.tensor(step_scores, dtype=torch.float),
        }

class StepwiseDPOTrainer(DPOTrainer):
    """DPOTrainer subclass with a score-weighted pairwise loss and an LLM critic.

    Besides the regular trainer state, the instance owns a second causal LM
    (``reward_model`` / ``reward_tokenizer``) that is prompted to produce
    free-text reasoning about individual solution steps.

    Args:
        *args, **kwargs: Passed through to ``DPOTrainer.__init__``. Keyword
            arguments have to be accepted because callers supply ``args=...``
            and ``tokenizer=...`` by name.
        reward_model_name: Hub id or local path of the critic checkpoint.
    """

    def __init__(self, *args, reward_model_name="meta-llama/Llama-3.2-1B", **kwargs):
        super().__init__(*args, **kwargs)
        self.reward_model = AutoModelForCausalLM.from_pretrained(reward_model_name)
        self.reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_name)

    def get_llm_reasoning(self, prompt, step):
        """Generate the critic's reasoning text for one solution step.

        Args:
            prompt: Problem statement.
            step: Text of the step under evaluation.

        Returns:
            The decoded first sequence returned by the critic's ``generate``.
        """
        cot_prompt = f"""
        Problem: {prompt}
        Step: {step}

        Let's evaluate this step:
        1. Is the step logically correct?
        2. Does it contribute to solving the problem?
        3. Is it clear and well-explained?

        Reasoning:
        """

        # Place the prompt on the critic's own device. The critic is loaded
        # independently of the policy, so the policy's device is not
        # guaranteed to match.
        inputs = self.reward_tokenizer(
            cot_prompt, return_tensors="pt", truncation=True, max_length=512
        ).to(self.reward_model.device)

        with torch.no_grad():
            outputs = self.reward_model.generate(**inputs, max_new_tokens=200, num_return_sequences=1)

        reasoning = self.reward_tokenizer.decode(outputs[0], skip_special_tokens=True)

        return reasoning

    @staticmethod
    def _stepwise_sequence_logps(model, input_ids, attention_mask):
        """Sum, per batch row, the log-probabilities of the observed tokens.

        Logits at position ``t`` are matched with the token at ``t + 1``, and
        targets masked out by ``attention_mask`` are excluded from the sum.
        """
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        log_probs = torch.log_softmax(logits[:, :-1, :], dim=-1)
        next_tokens = input_ids[:, 1:].unsqueeze(-1)
        token_logps = torch.gather(log_probs, -1, next_tokens).squeeze(-1)
        keep = attention_mask[:, 1:].to(token_logps.dtype)
        return (token_logps * keep).sum(dim=-1)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Score-weighted log-likelihood margin between chosen and rejected.

        Args:
            model: Model being optimised.
            inputs: Batch containing ``chosen_input_ids``,
                ``chosen_attention_mask``, ``rejected_input_ids``,
                ``rejected_attention_mask`` and ``scores``.
            return_outputs: If true, additionally return the detached
                per-sequence log-probabilities in a dict.
            **kwargs: Additional arguments a trainer loop may supply; unused.

        Returns:
            A scalar loss tensor, or ``(loss, outputs)`` when
            ``return_outputs`` is true.
        """
        chosen_logps = self._stepwise_sequence_logps(
            model, inputs['chosen_input_ids'], inputs['chosen_attention_mask']
        )
        rejected_logps = self._stepwise_sequence_logps(
            model, inputs['rejected_input_ids'], inputs['rejected_attention_mask']
        )
        margin = chosen_logps - rejected_logps

        weights = inputs['scores'].to(device=margin.device, dtype=margin.dtype)
        if weights.dim() > 1:
            # NOTE(review): step-level scores are averaged to a single weight
            # per example so they broadcast against the per-sequence margin;
            # confirm this reduction is the intended one.
            weights = weights.mean(dim=-1)

        loss = -torch.mean(weights * margin)

        if not return_outputs:
            return loss
        return loss, {
            'chosen_logps': chosen_logps.detach(),
            'rejected_logps': rejected_logps.detach(),
        }

def process_data(sample, get_llm_reasoning):
    """Turn one raw annotated sample into a prompt and its preference steps.

    Args:
        sample: Mapping exposing ``sample['question']['problem']`` and
            ``sample['label']['steps']``; each step has ``'chosen_completion'``
            (index or ``None``), ``'human_completion'`` and ``'completions'``
            (list of ``{'text', 'rating'}`` mappings).
        get_llm_reasoning: Callable ``(problem, step_text) -> reasoning``.

    Returns:
        ``{'prompt': problem, 'steps': [...]}``; each kept step carries
        ``'text'``, ``'rejected'``, ``'score'`` (rating of the chosen
        completion) and ``'reasoning'``. Steps that have neither a chosen
        index nor a human completion are omitted.
    """
    problem = sample['question']['problem']

    processed_steps = []
    for step in sample['label']['steps']:
        candidates = step['completions']
        picked = step['chosen_completion']

        if picked is not None:
            chosen_step = candidates[picked]
            # The neighbour (cyclically) acts as the rejected alternative; a
            # lone completion is therefore paired with itself.
            rejected_step = candidates[(picked + 1) % len(candidates)]
        elif step['human_completion']:
            chosen_step = step['human_completion']
            # An empty completion list used to raise IndexError here. Pair the
            # human step with itself, matching the lone-completion case above.
            rejected_step = candidates[0] if candidates else chosen_step
        else:
            continue

        processed_steps.append({
            'text': chosen_step['text'],
            'rejected': rejected_step['text'],
            # Use the rating from the dataset as the score
            'score': chosen_step['rating'],
            # Get the chain of thought reasoning
            'reasoning': get_llm_reasoning(problem, chosen_step['text']),
        })

    return {
        'prompt': problem,
        'steps': processed_steps
    }

def load_data(file_path, get_llm_reasoning):
    """Read a JSON data file and run ``process_data`` over each record.

    ``load_dataset('json', ...)`` provides the ``'train'`` split, which is
    mapped record by record; the split's original columns are removed so the
    result only contains what ``process_data`` returns.

    Args:
        file_path: Value passed to ``load_dataset`` as ``data_files``.
        get_llm_reasoning: Callable handed to ``process_data`` per record.

    Returns:
        The mapped ``'train'`` split.
    """
    records = load_dataset('json', data_files=file_path)['train']

    def _annotate(row):
        return process_data(row, get_llm_reasoning)

    columns_to_drop = records.column_names
    return records.map(_annotate, remove_columns=columns_to_drop)

def main():
    """Fine-tune the base checkpoint with the stepwise DPO trainer.

    Steps, in order: load tokenizer and two copies of the checkpoint (policy
    and reference), ensure a padding token, move both models to the selected
    device, add LoRA adapters to the policy, create the trainer, preprocess
    the train/test JSONL files with the trainer's critic, train, and write the
    model to ``./final_model``.
    """
    logger.info("Starting Stepwise DPO training with LLM reward model")

    base_checkpoint = "meta-llama/Llama-3.2-1B"
    tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
    policy_model = AutoModelForCausalLM.from_pretrained(base_checkpoint)
    frozen_reference = AutoModelForCausalLM.from_pretrained(base_checkpoint)
    both_models = (policy_model, frozen_reference)

    # Reuse EOS for padding when the tokenizer ships without a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        for candidate in both_models:
            candidate.config.pad_token_id = tokenizer.pad_token_id

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for candidate in both_models:
        candidate.to(device)

    # LoRA adapters are attached to the policy only.
    adapter_config = LoraConfig(
        task_type="CAUSAL_LM",
        bias="none",
        lora_dropout=0.05,
        lora_alpha=32,
        r=16,
    )
    policy_model = get_peft_model(policy_model, adapter_config)

    run_config = TrainingArguments(
        # output / logging locations
        output_dir="./results",
        logging_dir="./logs",
        logging_steps=10,
        # optimisation
        num_train_epochs=3,
        learning_rate=5e-5,
        weight_decay=0.01,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # evaluation / checkpointing cadence
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=1000,
        load_best_model_at_end=True,
        # keep the custom columns produced by StepwiseDataset
        remove_unused_columns=False,
    )

    trainer = StepwiseDPOTrainer(
        policy_model,
        frozen_reference,
        args=run_config,
        tokenizer=tokenizer,
    )

    # The critic lives on the trainer, so data can only be annotated now.
    annotate = trainer.get_llm_reasoning
    annotated_train = load_data('phase2_train.jsonl', annotate)
    annotated_eval = load_data('phase2_test.jsonl', annotate)

    trainer.train_dataset, trainer.eval_dataset = (
        StepwiseDataset(split, tokenizer, max_length=512)
        for split in (annotated_train, annotated_eval)
    )

    trainer.train()
    trainer.save_model("./final_model")

    logger.info("Stepwise DPO training with LLM reward model completed successfully")


# Script entry point.
if __name__ == "__main__":
    main()

RuntimeError: Failed to import trl.trainer.dpo_trainer because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
cannot import name 'isin_mps_friendly' from 'transformers.pytorch_utils' (/usr/local/lib/python3.10/dist-packages/transformers/pytorch_utils.py)