In [None]:

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class DeepSeekR1:
    """Two-stage trainer for a causal LM: supervised fine-tuning, then a
    clipped policy-gradient (GRPO-style) stage driven by a reward function.

    ``reward_fn`` must be set to a callable ``(output_text, expected_answer)
    -> float`` before ``fine_tune_with_rl`` is called.
    """

    def __init__(self, model_name):
        """Load the tokenizer and causal LM identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token  # Add padding token for GPT-2
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        # Assigned by the caller before the RL stage.
        self.reward_fn = None

    def supervised_fine_tuning(self, dataset, num_epochs=1, batch_size=2, learning_rate=5e-5):
        """Fine-tune the model on prompt/target pairs with a next-token loss.

        Each example is trained as one sequence ``"<input> <output><eos>"``.
        Labels are a copy of the input ids in which prompt and padding
        positions are replaced by -100, so only the target tokens contribute
        to the loss and logits and labels always have the same length.

        Args:
            dataset: Indexable collection of dicts with string fields
                ``"input"`` and ``"output"``.
            num_epochs: Number of passes over ``dataset``.
            batch_size: Examples per optimizer step.
            learning_rate: AdamW learning rate.
        """
        tokenizer = self.tokenizer
        device = self.model.device
        eos_text = tokenizer.eos_token or ""
        left_padded = getattr(tokenizer, "padding_side", "right") == "left"

        def build_batch(batch_prompts, batch_targets):
            # Prompt and target are joined so the model sees the prompt as context.
            texts = [f"{p} {t}{eos_text}" for p, t in zip(batch_prompts, batch_targets)]
            encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
            mask = encoded["attention_mask"]
            labels = encoded["input_ids"].clone()
            # Mask padding via the attention mask: pad and EOS share an id here,
            # so masking by token id would also hide the real EOS.
            labels[mask == 0] = -100
            width = labels.shape[1]
            for row, p in enumerate(batch_prompts):
                # Approximation: assumes the prompt tokenizes to the same prefix
                # on its own as inside the joined text.
                prompt_len = len(tokenizer(p)["input_ids"])
                first = width - int(mask[row].sum()) if left_padded else 0
                labels[row, first:first + prompt_len] = -100
            return encoded, labels

        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = optim.AdamW(self.model.parameters(), lr=learning_rate)
        self.model.train()
        for epoch in range(num_epochs):
            total_loss = 0.0
            for batch in dataloader:
                inputs, labels = build_batch(batch["input"], batch["output"])
                loss = self.model(**inputs, labels=labels).loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / max(len(dataloader), 1):.4f}")

    def fine_tune_with_rl(self, prompts, expected_answers, num_epochs=1, batch_size=2, learning_rate=1e-5, clip_epsilon=0.2):
        """Run the reward-driven policy-gradient stage.

        For every batch the model generates one completion per prompt, each
        completion is scored with ``self.reward_fn``, and the clipped
        surrogate loss from ``compute_grpo_loss`` is minimised. The policy
        log-probability of a completion is the mean log-probability of its
        tokens (prompt tokens excluded).

        Args:
            prompts: Sequence of prompt strings.
            expected_answers: Reference outputs, aligned with ``prompts``.
            num_epochs: Number of passes over ``prompts``.
            batch_size: Prompts per optimizer step.
            learning_rate: AdamW learning rate.
            clip_epsilon: Clipping range for the probability ratio.

        Raises:
            AttributeError: If ``reward_fn`` has not been assigned.
        """
        reward_fn = getattr(self, "reward_fn", None)
        if reward_fn is None:
            raise AttributeError(
                "reward_fn is not set; assign a callable "
                "(output_text, expected_answer) -> float before calling fine_tune_with_rl"
            )
        tokenizer = self.tokenizer
        model = self.model
        device = model.device

        def completion_log_prob(prompt, generated_text):
            # generate() returns prompt + completion as text, so re-encode it
            # and score only the tokens that follow the prompt.
            encoded = tokenizer(generated_text, return_tensors="pt").to(device)
            input_ids = encoded["input_ids"]
            if input_ids.shape[1] < 2:
                return torch.zeros((), device=device)
            logits = model(**encoded).logits[:, :-1, :]
            targets = input_ids[:, 1:]
            token_log_probs = (
                torch.log_softmax(logits.float(), dim=-1)
                .gather(-1, targets.unsqueeze(-1))
                .squeeze(-1)
            )
            prompt_len = len(tokenizer(prompt)["input_ids"])
            completion_mask = torch.zeros_like(token_log_probs)
            # Shifted position j predicts token j + 1.
            completion_mask[:, max(prompt_len - 1, 0):] = 1.0
            return (token_log_probs * completion_mask).sum() / completion_mask.sum().clamp(min=1.0)

        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
        for epoch in range(num_epochs):
            epoch_loss = 0.0
            num_batches = 0
            for i in range(0, len(prompts), batch_size):
                batch_prompts = prompts[i:i + batch_size]
                batch_answers = expected_answers[i:i + batch_size]

                # Sample with dropout disabled, then switch back for the update.
                model.eval()
                generated_texts = [self.generate(prompt) for prompt in batch_prompts]
                model.train()

                # Compute rewards
                rewards = torch.tensor(
                    [reward_fn(output, expected) for output, expected in zip(generated_texts, batch_answers)],
                    dtype=torch.float32,
                    device=device,
                )

                # torch.stack keeps the autograd graph; rebuilding the tensor
                # with torch.tensor() would detach it from the model.
                new_log_probs = torch.stack(
                    [completion_log_prob(p, g) for p, g in zip(batch_prompts, generated_texts)]
                )
                # One update per sampled batch: the sampling policy is the
                # current policy, so its log-probs are the detached values.
                old_log_probs = new_log_probs.detach()

                # Compute GRPO Loss
                loss = self.compute_grpo_loss(old_log_probs, new_log_probs, rewards, clip_epsilon)
                num_batches += 1
                epoch_loss += loss.item()
                if not loss.requires_grad:
                    # Nothing to optimise (no scorable tokens in this batch).
                    continue

                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / max(num_batches, 1):.4f}")

    def compute_grpo_loss(self, old_log_probs, new_log_probs, rewards, clip_epsilon=0.2):
        """
        Compute the clipped surrogate loss for policy optimization.

        Args:
            old_log_probs (torch.Tensor): Log probabilities from the old policy.
            new_log_probs (torch.Tensor): Log probabilities from the new policy.
            rewards (torch.Tensor): Rewards for the generated outputs; used
                directly as the advantage term.
            clip_epsilon (float): Clipping parameter for PPO-like stability.

        Returns:
            torch.Tensor: Scalar loss (negated mean clipped objective).
        """
        ratios = torch.exp(new_log_probs - old_log_probs)
        clipped_ratios = torch.clamp(ratios, 1 - clip_epsilon, 1 + clip_epsilon)
        loss = -torch.min(ratios * rewards, clipped_ratios * rewards).mean()
        return loss

    def generate(self, input_text, max_length=50):
        """Generate a continuation of ``input_text`` and return the decoded text.

        ``max_length`` is the total sequence length (prompt included) passed
        to ``model.generate``. The returned string is the decoded full
        sequence, i.e. it starts with the prompt.
        """
        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
        # No autograd graph is needed while sampling.
        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Rule-based Reward Function for Multi-Step Reasoning
def rule_based_reward(output_text, expected_answer=None, task_type="reasoning"):
    """Score a generated output with simple format / step / answer rules.

    Args:
        output_text: Text produced by the model.
        expected_answer: Reference output. When ``None`` only the format
            reward can be granted, since there is nothing to compare against.
        task_type: Accepted for interface compatibility; not used.

    Returns:
        float: 0.3 for containing ``<think>``/``</think>``, up to 0.2 for
        reasoning segments that occur verbatim in ``expected_answer``, and
        0.5 when the ``[answer]`` content matches the reference's.
    """
    reward = 0.0

    # Format Reward: Check for proper reasoning structure
    if "<think>" in output_text and "</think>" in output_text:
        reward += 0.3  # Reward for using the correct format

    # Without a reference neither steps nor the final answer can be checked;
    # the default None previously crashed on the substring test below.
    if expected_answer is None:
        return reward

    # Step-by-Step Reward: Check intermediate steps
    steps = [segment.strip() for segment in output_text.split("<think>") if "</think>" in segment]
    for step in steps:
        if step in expected_answer:  # Check if the step matches the expected reasoning
            reward += 0.2 / len(steps)  # Reward each correct step proportionally

    # Final Answer Reward: Check for correct answer
    if "[answer]" in output_text and "[/answer]" in output_text:
        answer = extract_answer(output_text)
        if answer == extract_answer(expected_answer):
            reward += 0.5

    return reward

# Helper Functions
def extract_answer(output_text):
    """Return the stripped text between the first ``[answer]`` tag and the
    next ``[/answer]`` tag, or ``None`` if no such pair exists.

    The closing tag is searched for *after* the opening tag, so a stray
    ``[/answer]`` earlier in the text does not produce an empty or wrong
    slice. ``None`` or empty input yields ``None``.
    """
    if not output_text:
        return None
    open_tag, close_tag = "[answer]", "[/answer]"
    start = output_text.find(open_tag)
    if start == -1:
        return None
    start += len(open_tag)
    end = output_text.find(close_tag, start)
    if end == -1:
        return None
    return output_text[start:end].strip()

# Dataset for Multi-Step Reasoning
# Each example pairs a prompt with its reference output: three <think> steps
# followed by the tagged final answer, separated by single spaces.
data = [
    {
        "input": "Why is the sky blue?",
        "output": " ".join([
            "<think>Step 1: Sunlight contains all colors of light.</think>",
            "<think>Step 2: As sunlight passes through the atmosphere, it interacts with air molecules.</think>",
            "<think>Step 3: Shorter wavelengths, like blue, scatter more than longer wavelengths, like red.</think>",
            "[answer]Rayleigh scattering[/answer]",
        ]),
    },
    {
        "input": "What is 2+2?",
        "output": " ".join([
            "<think>Step 1: Start with the first number: 2.</think>",
            "<think>Step 2: Add the second number: 2.</think>",
            "<think>Step 3: The result of the addition is 4.</think>",
            "[answer]4[/answer]",
        ]),
    },
]

# Parallel views of the same examples for the RL stage.
prompts = [example["input"] for example in data]
expected_answers = [example["output"] for example in data]

# Create Dataset Class
class ReasoningDataset(Dataset):
    """Map-style dataset that exposes an in-memory sequence of examples."""

    def __init__(self, data):
        # Keep a reference to the caller's sequence; items are returned as-is.
        self.data = data

    def __len__(self):
        """Return the number of stored examples."""
        examples = self.data
        return len(examples)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        example = self.data[idx]
        return example

# Example Usage
if __name__ == "__main__":
    # Build the trainer and wrap the module-level examples for the DataLoader.
    model_name = "distilgpt2"
    deepseek_r1 = DeepSeekR1(model_name)
    dataset = ReasoningDataset(data)

    # The RL stage reads its reward callable from this attribute.
    deepseek_r1.reward_fn = rule_based_reward

    # Stage 1: supervised fine-tuning on the prompt/output pairs.
    deepseek_r1.supervised_fine_tuning(dataset=dataset, num_epochs=1, batch_size=1)

    # Stage 2: reward-driven fine-tuning with the clipped policy loss.
    deepseek_r1.fine_tune_with_rl(
        prompts=prompts,
        expected_answers=expected_answers,
        num_epochs=1,
        batch_size=1,
    )



In [None]:

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class DeepSeekR1:
    def __init__(self, model_name, model_type="gpt2", mixed_precision=True, reward_fn=None):
        """
        Initialize the model with either GPT-2 or LLaMA.

        Args:
            model_name (str): Name of the pretrained model.
            model_type (str): Type of the model, either "gpt2" or "llama".
            mixed_precision (bool): Whether to load the weights in fp16 for memory efficiency.
            reward_fn (callable | None): Optional reward callable
                ``(output_text, expected_answer) -> float`` used by the RL
                stage; it can also be assigned to ``self.reward_fn`` later.

        Raises:
            ValueError: If ``model_type`` is neither "gpt2" nor "llama".
        """
        if model_type not in ("gpt2", "llama"):
            raise ValueError("Unsupported model type. Use 'gpt2' or 'llama'.")

        self.model_type = model_type
        self.mixed_precision = mixed_precision
        self.reward_fn = reward_fn

        # NOTE(review): the weights themselves are stored in fp16; optimizing
        # fp16 weights directly can be numerically unstable - confirm this is
        # intended rather than autocast with fp32 weights.
        if model_type == "gpt2":
            use_cuda = torch.cuda.is_available()
            # Fall back to CPU (and fp32) instead of crashing when no GPU exists.
            dtype = torch.float16 if (mixed_precision and use_cuda) else torch.float32
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
            self.model.to(torch.device("cuda" if use_cuda else "cpu"))
        else:
            dtype = torch.float16 if mixed_precision else torch.float32
            self.tokenizer = LlamaTokenizer.from_pretrained(model_name)
            # from_pretrained places/offloads the weights itself when given a
            # device_map; it must not be wrapped in init_empty_weights(), and
            # load_checkpoint_and_dispatch expects a checkpoint path rather
            # than a hub model id.
            self.model = LlamaForCausalLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map="auto",
                offload_folder="offload",
            )

        # Batched tokenization with padding=True requires a pad token; reuse
        # EOS when the tokenizer does not define one.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Enable gradient checkpointing for memory savings
        self.model.gradient_checkpointing_enable()



In [None]:

    def supervised_fine_tuning(self, dataset, num_epochs=1, batch_size=1, learning_rate=5e-5):
        """Fine-tune the model on prompt/target pairs with a next-token loss.

        Each example is trained as one sequence ``"<input> <output><eos>"``.
        Labels are a copy of the input ids in which prompt and padding
        positions are replaced by -100, so only the target tokens contribute
        to the loss and logits and labels always have the same length.

        Args:
            dataset: Indexable collection of dicts with string fields
                ``"input"`` and ``"output"``.
            num_epochs: Number of passes over ``dataset``.
            batch_size: Examples per optimizer step.
            learning_rate: AdamW learning rate.
        """
        tokenizer = self.tokenizer
        # Follow the model's device instead of assuming CUDA.
        device = self.model.device
        eos_text = tokenizer.eos_token or ""
        left_padded = getattr(tokenizer, "padding_side", "right") == "left"

        def build_batch(batch_prompts, batch_targets):
            # Prompt and target are joined so the model sees the prompt as context.
            texts = [f"{p} {t}{eos_text}" for p, t in zip(batch_prompts, batch_targets)]
            encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
            mask = encoded["attention_mask"]
            labels = encoded["input_ids"].clone()
            # Mask padding via the attention mask: when pad and EOS share an
            # id, masking by token id would also hide the real EOS.
            labels[mask == 0] = -100
            width = labels.shape[1]
            for row, p in enumerate(batch_prompts):
                # Approximation: assumes the prompt tokenizes to the same prefix
                # on its own as inside the joined text.
                prompt_len = len(tokenizer(p)["input_ids"])
                first = width - int(mask[row].sum()) if left_padded else 0
                labels[row, first:first + prompt_len] = -100
            return encoded, labels

        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = optim.AdamW(self.model.parameters(), lr=learning_rate)
        self.model.train()
        for epoch in range(num_epochs):
            total_loss = 0.0
            for batch in dataloader:
                inputs, labels = build_batch(batch["input"], batch["output"])
                loss = self.model(**inputs, labels=labels).loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / max(len(dataloader), 1):.4f}")



In [None]:

    def fine_tune_with_rl(self, prompts, expected_answers, num_epochs=1, batch_size=1, learning_rate=1e-5, clip_epsilon=0.2):
        """Run the reward-driven policy-gradient stage.

        For every batch the model generates one completion per prompt, each
        completion is scored with ``self.reward_fn``, and the clipped
        surrogate loss from ``compute_grpo_loss`` is minimised. The policy
        log-probability of a completion is the mean log-probability of its
        tokens (prompt tokens excluded).

        Args:
            prompts: Sequence of prompt strings.
            expected_answers: Reference outputs, aligned with ``prompts``.
            num_epochs: Number of passes over ``prompts``.
            batch_size: Prompts per optimizer step.
            learning_rate: AdamW learning rate.
            clip_epsilon: Clipping range for the probability ratio.

        Raises:
            AttributeError: If ``reward_fn`` has not been assigned.
        """
        reward_fn = getattr(self, "reward_fn", None)
        if reward_fn is None:
            raise AttributeError(
                "reward_fn is not set; assign a callable "
                "(output_text, expected_answer) -> float before calling fine_tune_with_rl"
            )
        tokenizer = self.tokenizer
        model = self.model
        # Follow the model's device instead of assuming CUDA.
        device = model.device

        def completion_log_prob(prompt, generated_text):
            # generate() returns prompt + completion as text, so re-encode it
            # and score only the tokens that follow the prompt.
            encoded = tokenizer(generated_text, return_tensors="pt").to(device)
            input_ids = encoded["input_ids"]
            if input_ids.shape[1] < 2:
                return torch.zeros((), device=device)
            logits = model(**encoded).logits[:, :-1, :]
            targets = input_ids[:, 1:]
            token_log_probs = (
                torch.log_softmax(logits.float(), dim=-1)
                .gather(-1, targets.unsqueeze(-1))
                .squeeze(-1)
            )
            prompt_len = len(tokenizer(prompt)["input_ids"])
            completion_mask = torch.zeros_like(token_log_probs)
            # Shifted position j predicts token j + 1.
            completion_mask[:, max(prompt_len - 1, 0):] = 1.0
            return (token_log_probs * completion_mask).sum() / completion_mask.sum().clamp(min=1.0)

        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
        for epoch in range(num_epochs):
            epoch_loss = 0.0
            num_batches = 0
            for i in range(0, len(prompts), batch_size):
                batch_prompts = prompts[i:i + batch_size]
                batch_answers = expected_answers[i:i + batch_size]

                # Sample with dropout disabled, then switch back for the update.
                model.eval()
                generated_texts = [self.generate(prompt) for prompt in batch_prompts]
                model.train()

                # Compute rewards
                rewards = torch.tensor(
                    [reward_fn(output, expected) for output, expected in zip(generated_texts, batch_answers)],
                    dtype=torch.float32,
                    device=device,
                )

                # torch.stack keeps the autograd graph; rebuilding the tensor
                # with torch.tensor() would detach it from the model.
                new_log_probs = torch.stack(
                    [completion_log_prob(p, g) for p, g in zip(batch_prompts, generated_texts)]
                )
                # One update per sampled batch: the sampling policy is the
                # current policy, so its log-probs are the detached values.
                old_log_probs = new_log_probs.detach()

                # Compute GRPO Loss
                loss = self.compute_grpo_loss(old_log_probs, new_log_probs, rewards, clip_epsilon)
                num_batches += 1
                epoch_loss += loss.item()
                if not loss.requires_grad:
                    # Nothing to optimise (no scorable tokens in this batch).
                    continue

                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / max(num_batches, 1):.4f}")

    def compute_grpo_loss(self, old_log_probs, new_log_probs, rewards, clip_epsilon=0.2):
        """Return the negated mean of the clipped surrogate objective.

        The probability ratio ``exp(new - old)`` is multiplied by ``rewards``
        both as-is and after clamping to ``[1 - clip_epsilon, 1 + clip_epsilon]``;
        the element-wise smaller product is averaged and negated.
        """
        lower_bound = 1 - clip_epsilon
        upper_bound = 1 + clip_epsilon
        importance = (new_log_probs - old_log_probs).exp()
        unclipped_term = importance * rewards
        clipped_term = importance.clamp(lower_bound, upper_bound) * rewards
        surrogate = torch.minimum(unclipped_term, clipped_term)
        return -surrogate.mean()



In [None]:

    def generate(self, input_text, max_length=30):
        """Generate a continuation of ``input_text`` and return the decoded text.

        ``max_length`` is the total sequence length (prompt included) passed
        to ``model.generate``. The returned string is the decoded full
        sequence, i.e. it starts with the prompt.
        """
        # Follow the model's device instead of assuming CUDA.
        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
        # No autograd graph is needed while sampling.
        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                # Pass the mask explicitly: pad and EOS share an id, so it
                # cannot be inferred reliably from the token ids.
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def evaluate(self, prompts, expected_answers):
        """Generate an output per prompt and score answers and reasoning steps.

        Args:
            prompts: Sequence of prompt strings.
            expected_answers: Reference outputs, aligned with ``prompts``.

        Returns:
            dict: ``"answer_accuracy"`` is the fraction of prompts whose
            extracted ``[answer]`` equals the reference's (a missing answer
            never counts as correct). ``"reasoning_accuracy"`` is the fraction
            of reference reasoning segments reproduced verbatim in the output.
            Both are 0.0 when there is nothing to score.
        """
        def split_steps(text):
            return [segment.strip() for segment in text.split("<think>") if "</think>" in segment]

        self.model.eval()
        num_examples = 0
        correct_answers = 0
        matched_steps = 0
        expected_step_count = 0

        for prompt, expected in zip(prompts, expected_answers):
            num_examples += 1
            output = self.generate(prompt)

            predicted_answer = extract_answer(output)
            # Two missing answers compare equal (None == None); require an
            # actual prediction before counting it as correct.
            if predicted_answer is not None and predicted_answer == extract_answer(expected):
                correct_answers += 1

            produced_steps = set(split_steps(output))
            expected_steps = split_steps(expected)
            expected_step_count += len(expected_steps)
            matched_steps += sum(1 for step in expected_steps if step in produced_steps)

        # Normalise by what could have been matched so both values stay in
        # [0, 1], and avoid dividing by zero on empty input.
        answer_accuracy = correct_answers / num_examples if num_examples else 0.0
        reasoning_accuracy = matched_steps / expected_step_count if expected_step_count else 0.0

        return {
            "answer_accuracy": answer_accuracy,
            "reasoning_accuracy": reasoning_accuracy,
        }

# Rule-based reward function
def rule_based_reward(output_text, expected_answer=None):
    """Score a generated output with simple format / step / answer rules.

    Args:
        output_text: Text produced by the model.
        expected_answer: Reference output. When ``None`` only the format
            reward can be granted, since there is nothing to compare against.

    Returns:
        float: 0.3 for containing ``<think>``/``</think>``, up to 0.2 for
        reasoning segments that occur verbatim in ``expected_answer``, and
        0.5 when the ``[answer]`` content matches the reference's.
    """
    reward = 0.0
    if "<think>" in output_text and "</think>" in output_text:
        reward += 0.3
    # Without a reference neither steps nor the final answer can be checked;
    # the default None previously crashed on the substring test below.
    if expected_answer is None:
        return reward
    steps = [segment.strip() for segment in output_text.split("<think>") if "</think>" in segment]
    for step in steps:
        if step in expected_answer:
            reward += 0.2 / len(steps)
    if "[answer]" in output_text and "[/answer]" in output_text:
        answer = extract_answer(output_text)
        if answer == extract_answer(expected_answer):
            reward += 0.5
    return reward

def extract_answer(output_text):
    """Return the stripped text between the first ``[answer]`` tag and the
    next ``[/answer]`` tag, or ``None`` if no such pair exists.

    The closing tag is searched for *after* the opening tag, so a stray
    ``[/answer]`` earlier in the text does not produce an empty or wrong
    slice. ``None`` or empty input yields ``None``.
    """
    if not output_text:
        return None
    open_tag, close_tag = "[answer]", "[/answer]"
    start = output_text.find(open_tag)
    if start == -1:
        return None
    start += len(open_tag)
    end = output_text.find(close_tag, start)
    if end == -1:
        return None
    return output_text[start:end].strip()



In [None]:

if __name__ == "__main__":
    model_name = "meta-llama/Llama-2-7b-hf"
    model_type = "llama"  # Change to "gpt2" for GPT models

    dataset = [
        {"input": "Why is the sky blue?", "output": "<think>...reasoning...</think> [answer]Rayleigh scattering[/answer]"},
        {"input": "What is 2+2?", "output": "<think>...reasoning...</think> [answer]4[/answer]"}
    ]
    prompts = [item["input"] for item in dataset]
    expected_answers = [item["output"] for item in dataset]

    model = DeepSeekR1(model_name=model_name, model_type=model_type)
    # fine_tune_with_rl reads the reward callable from this attribute; without
    # the assignment the RL stage fails with AttributeError.
    model.reward_fn = rule_based_reward

    print("Starting supervised fine-tuning...")
    model.supervised_fine_tuning(dataset, num_epochs=1, batch_size=1)

    print("Starting reinforcement learning...")
    model.fine_tune_with_rl(prompts, expected_answers, num_epochs=1, batch_size=1)

    print("Evaluating...")
    metrics = model.evaluate(prompts, expected_answers)
    print("Metrics:", metrics)



In [None]:

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class DeepSeekR1:
    """Two-stage trainer for a causal LM (GPT-2 or LLaMA): supervised
    fine-tuning, then a clipped policy-gradient (GRPO-style) stage driven by
    a reward function, plus a simple evaluation helper.
    """

    def __init__(self, model_name, model_type="gpt2", mixed_precision=True, reward_fn=None):
        """
        Initialize the model with either GPT-2 or LLaMA.

        Args:
            model_name (str): Name of the pretrained model.
            model_type (str): Type of the model, either "gpt2" or "llama".
            mixed_precision (bool): Whether to load the weights in fp16 for memory efficiency.
            reward_fn (callable | None): Optional reward callable
                ``(output_text, expected_answer) -> float`` used by the RL
                stage; it can also be assigned to ``self.reward_fn`` later.

        Raises:
            ValueError: If ``model_type`` is neither "gpt2" nor "llama".
        """
        if model_type not in ("gpt2", "llama"):
            raise ValueError("Unsupported model type. Use 'gpt2' or 'llama'.")

        self.model_type = model_type
        self.mixed_precision = mixed_precision
        self.reward_fn = reward_fn

        # NOTE(review): the weights themselves are stored in fp16; optimizing
        # fp16 weights directly can be numerically unstable - confirm this is
        # intended rather than autocast with fp32 weights.
        if model_type == "gpt2":
            use_cuda = torch.cuda.is_available()
            # Fall back to CPU (and fp32) instead of crashing when no GPU exists.
            dtype = torch.float16 if (mixed_precision and use_cuda) else torch.float32
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
            self.model.to(torch.device("cuda" if use_cuda else "cpu"))
        else:
            dtype = torch.float16 if mixed_precision else torch.float32
            self.tokenizer = LlamaTokenizer.from_pretrained(model_name)
            # from_pretrained places/offloads the weights itself when given a
            # device_map; it must not be wrapped in init_empty_weights(), and
            # load_checkpoint_and_dispatch expects a checkpoint path rather
            # than a hub model id.
            self.model = LlamaForCausalLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map="auto",
                offload_folder="offload",
            )

        # Batched tokenization with padding=True requires a pad token; reuse
        # EOS when the tokenizer does not define one.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Enable gradient checkpointing for memory savings
        self.model.gradient_checkpointing_enable()

    def supervised_fine_tuning(self, dataset, num_epochs=1, batch_size=1, learning_rate=5e-5):
        """Fine-tune the model on prompt/target pairs with a next-token loss.

        Each example is trained as one sequence ``"<input> <output><eos>"``.
        Labels are a copy of the input ids in which prompt and padding
        positions are replaced by -100, so only the target tokens contribute
        to the loss and logits and labels always have the same length.

        Args:
            dataset: Indexable collection of dicts with string fields
                ``"input"`` and ``"output"``.
            num_epochs: Number of passes over ``dataset``.
            batch_size: Examples per optimizer step.
            learning_rate: AdamW learning rate.
        """
        tokenizer = self.tokenizer
        # Follow the model's device instead of assuming CUDA.
        device = self.model.device
        eos_text = tokenizer.eos_token or ""
        left_padded = getattr(tokenizer, "padding_side", "right") == "left"

        def build_batch(batch_prompts, batch_targets):
            # Prompt and target are joined so the model sees the prompt as context.
            texts = [f"{p} {t}{eos_text}" for p, t in zip(batch_prompts, batch_targets)]
            encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
            mask = encoded["attention_mask"]
            labels = encoded["input_ids"].clone()
            # Mask padding via the attention mask: when pad and EOS share an
            # id, masking by token id would also hide the real EOS.
            labels[mask == 0] = -100
            width = labels.shape[1]
            for row, p in enumerate(batch_prompts):
                # Approximation: assumes the prompt tokenizes to the same prefix
                # on its own as inside the joined text.
                prompt_len = len(tokenizer(p)["input_ids"])
                first = width - int(mask[row].sum()) if left_padded else 0
                labels[row, first:first + prompt_len] = -100
            return encoded, labels

        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = optim.AdamW(self.model.parameters(), lr=learning_rate)
        self.model.train()
        for epoch in range(num_epochs):
            total_loss = 0.0
            for batch in dataloader:
                inputs, labels = build_batch(batch["input"], batch["output"])
                loss = self.model(**inputs, labels=labels).loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / max(len(dataloader), 1):.4f}")

    def fine_tune_with_rl(self, prompts, expected_answers, num_epochs=1, batch_size=1, learning_rate=1e-5, clip_epsilon=0.2):
        """Run the reward-driven policy-gradient stage.

        For every batch the model generates one completion per prompt, each
        completion is scored with ``self.reward_fn``, and the clipped
        surrogate loss from ``compute_grpo_loss`` is minimised. The policy
        log-probability of a completion is the mean log-probability of its
        tokens (prompt tokens excluded).

        Args:
            prompts: Sequence of prompt strings.
            expected_answers: Reference outputs, aligned with ``prompts``.
            num_epochs: Number of passes over ``prompts``.
            batch_size: Prompts per optimizer step.
            learning_rate: AdamW learning rate.
            clip_epsilon: Clipping range for the probability ratio.

        Raises:
            AttributeError: If ``reward_fn`` has not been assigned.
        """
        reward_fn = getattr(self, "reward_fn", None)
        if reward_fn is None:
            raise AttributeError(
                "reward_fn is not set; assign a callable "
                "(output_text, expected_answer) -> float before calling fine_tune_with_rl"
            )
        tokenizer = self.tokenizer
        model = self.model
        device = model.device

        def completion_log_prob(prompt, generated_text):
            # generate() returns prompt + completion as text, so re-encode it
            # and score only the tokens that follow the prompt.
            encoded = tokenizer(generated_text, return_tensors="pt").to(device)
            input_ids = encoded["input_ids"]
            if input_ids.shape[1] < 2:
                return torch.zeros((), device=device)
            logits = model(**encoded).logits[:, :-1, :]
            targets = input_ids[:, 1:]
            token_log_probs = (
                torch.log_softmax(logits.float(), dim=-1)
                .gather(-1, targets.unsqueeze(-1))
                .squeeze(-1)
            )
            prompt_len = len(tokenizer(prompt)["input_ids"])
            completion_mask = torch.zeros_like(token_log_probs)
            # Shifted position j predicts token j + 1.
            completion_mask[:, max(prompt_len - 1, 0):] = 1.0
            return (token_log_probs * completion_mask).sum() / completion_mask.sum().clamp(min=1.0)

        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
        for epoch in range(num_epochs):
            epoch_loss = 0.0
            num_batches = 0
            for i in range(0, len(prompts), batch_size):
                batch_prompts = prompts[i:i + batch_size]
                batch_answers = expected_answers[i:i + batch_size]

                # Sample with dropout disabled, then switch back for the update.
                model.eval()
                generated_texts = [self.generate(prompt) for prompt in batch_prompts]
                model.train()

                # Compute rewards
                rewards = torch.tensor(
                    [reward_fn(output, expected) for output, expected in zip(generated_texts, batch_answers)],
                    dtype=torch.float32,
                    device=device,
                )

                # torch.stack keeps the autograd graph; rebuilding the tensor
                # with torch.tensor() would detach it from the model.
                new_log_probs = torch.stack(
                    [completion_log_prob(p, g) for p, g in zip(batch_prompts, generated_texts)]
                )
                # One update per sampled batch: the sampling policy is the
                # current policy, so its log-probs are the detached values.
                old_log_probs = new_log_probs.detach()

                # Compute GRPO Loss
                loss = self.compute_grpo_loss(old_log_probs, new_log_probs, rewards, clip_epsilon)
                num_batches += 1
                epoch_loss += loss.item()
                if not loss.requires_grad:
                    # Nothing to optimise (no scorable tokens in this batch).
                    continue

                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / max(num_batches, 1):.4f}")

    def compute_grpo_loss(self, old_log_probs, new_log_probs, rewards, clip_epsilon=0.2):
        """Return the negated mean of the clipped surrogate objective.

        ``rewards`` is used directly as the advantage term; the probability
        ratio ``exp(new - old)`` is clamped to
        ``[1 - clip_epsilon, 1 + clip_epsilon]``.
        """
        ratios = torch.exp(new_log_probs - old_log_probs)
        clipped_ratios = torch.clamp(ratios, 1 - clip_epsilon, 1 + clip_epsilon)
        loss = -torch.min(ratios * rewards, clipped_ratios * rewards).mean()
        return loss

    def generate(self, input_text, max_length=30):
        """Generate a continuation of ``input_text`` and return the decoded text.

        ``max_length`` is the total sequence length (prompt included) passed
        to ``model.generate``. The returned string is the decoded full
        sequence, i.e. it starts with the prompt.
        """
        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
        # No autograd graph is needed while sampling.
        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def evaluate(self, prompts, expected_answers):
        """Generate an output per prompt and score answers and reasoning steps.

        Args:
            prompts: Sequence of prompt strings.
            expected_answers: Reference outputs, aligned with ``prompts``.

        Returns:
            dict: ``"answer_accuracy"`` is the fraction of prompts whose
            extracted ``[answer]`` equals the reference's (a missing answer
            never counts as correct). ``"reasoning_accuracy"`` is the fraction
            of reference reasoning segments reproduced verbatim in the output.
            Both are 0.0 when there is nothing to score.
        """
        def split_steps(text):
            return [segment.strip() for segment in text.split("<think>") if "</think>" in segment]

        self.model.eval()
        num_examples = 0
        correct_answers = 0
        matched_steps = 0
        expected_step_count = 0

        for prompt, expected in zip(prompts, expected_answers):
            num_examples += 1
            output = self.generate(prompt)

            predicted_answer = extract_answer(output)
            # Two missing answers compare equal (None == None); require an
            # actual prediction before counting it as correct.
            if predicted_answer is not None and predicted_answer == extract_answer(expected):
                correct_answers += 1

            produced_steps = set(split_steps(output))
            expected_steps = split_steps(expected)
            expected_step_count += len(expected_steps)
            matched_steps += sum(1 for step in expected_steps if step in produced_steps)

        # Normalise by what could have been matched so both values stay in
        # [0, 1], and avoid dividing by zero on empty input.
        answer_accuracy = correct_answers / num_examples if num_examples else 0.0
        reasoning_accuracy = matched_steps / expected_step_count if expected_step_count else 0.0

        return {
            "answer_accuracy": answer_accuracy,
            "reasoning_accuracy": reasoning_accuracy,
        }

# Rule-based reward function
def rule_based_reward(output_text, expected_answer=None):
    """Score a generated output with simple format / step / answer rules.

    Args:
        output_text: Text produced by the model.
        expected_answer: Reference output. When ``None`` only the format
            reward can be granted, since there is nothing to compare against.

    Returns:
        float: 0.3 for containing ``<think>``/``</think>``, up to 0.2 for
        reasoning segments that occur verbatim in ``expected_answer``, and
        0.5 when the ``[answer]`` content matches the reference's.
    """
    reward = 0.0
    if "<think>" in output_text and "</think>" in output_text:
        reward += 0.3
    # Without a reference neither steps nor the final answer can be checked;
    # the default None previously crashed on the substring test below.
    if expected_answer is None:
        return reward
    steps = [segment.strip() for segment in output_text.split("<think>") if "</think>" in segment]
    for step in steps:
        if step in expected_answer:
            reward += 0.2 / len(steps)
    if "[answer]" in output_text and "[/answer]" in output_text:
        answer = extract_answer(output_text)
        if answer == extract_answer(expected_answer):
            reward += 0.5
    return reward

def extract_answer(output_text):
    """Return the stripped text between the first ``[answer]`` tag and the
    next ``[/answer]`` tag, or ``None`` if no such pair exists.

    The closing tag is searched for *after* the opening tag, so a stray
    ``[/answer]`` earlier in the text does not produce an empty or wrong
    slice. ``None`` or empty input yields ``None``.
    """
    if not output_text:
        return None
    open_tag, close_tag = "[answer]", "[/answer]"
    start = output_text.find(open_tag)
    if start == -1:
        return None
    start += len(open_tag)
    end = output_text.find(close_tag, start)
    if end == -1:
        return None
    return output_text[start:end].strip()

# Dataset for Multi-Step Reasoning
# (prompt, reference output) pairs expanded into the dict records that the
# training and evaluation code indexes by "input" / "output".
data = [
    {"input": question, "output": reference}
    for question, reference in (
        (
            "Why is the sky blue?",
            "<think>Step 1: Sunlight contains all colors...</think> [answer]Rayleigh scattering[/answer]",
        ),
        ("What is 2+2?", "<think>Step 1: Add two numbers...</think> [answer]4[/answer]"),
        ("What causes seasons?", "..."),
        ("What is gravity?", "..."),
        ("Why does ice float?", "..."),
        # Add 5 more examples...
    )
]

# Main
if __name__ == "__main__":
    # Model selection: a LLaMA checkpoint by default.
    model_type = "llama"
    model_name = "meta-llama/Llama-2-7b-hf"  # Or use "gpt2"

    # Supervised stage on the module-level examples, using the method defaults.
    model = DeepSeekR1(model_name, model_type)
    dataset = ReasoningDataset(data)
    model.supervised_fine_tuning(dataset)

    # Evaluate on the same examples that were used for training.
    metrics = model.evaluate(
        [x["input"] for x in data],
        [x["output"] for x in data],
    )
    print(metrics)

