# Setup

## Package Installation

In [1]:
#%pip install --upgrade pip
#%pip install transformers==4.37.0
#%pip uninstall torch torchvision torchaudio -y
#%pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 -y
#%pip install torch torchvision torchaudio
#%pip install tqdm
#%pip install numpy==1.24 #probably not needed, leave this commented
#%pip install urllib3==1.26.15
#%pip install accelerate==0.25.0
#%pip install datasets

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
import copy

import os
import gc
import re
import json
import logging
from tqdm import tqdm
from datasets import load_dataset

# Route INFO-and-above records through the root logger with a timestamped format
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Report the PyTorch build and pick the compute device (GPU when one is usable)
print(torch.__version__)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print('We are using the device {}.'.format(device))
if device.type == "cuda":
    print(f"Device count: {torch.cuda.device_count()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

2.6.0+cu124
We are using the device cuda.
Device count: 1
Device name: NVIDIA A100-SXM4-80GB


## Utils

In [3]:
def clear_gpu_memory():
    """Run Python garbage collection and, when CUDA is usable, release cached GPU memory.

    The CUDA calls are skipped on hosts without a usable GPU so the helper is
    safe to call on the CPU fallback path selected by the setup cell.

    Returns:
        None
    """
    gc.collect()
    if torch.cuda.is_available():
        # Hand cached allocator blocks back to the driver, then reclaim
        # memory held by CUDA IPC handles that are no longer referenced.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


clear_gpu_memory()

In [4]:
# Report GPU memory figures for device 0. The queries below require an initialised
# CUDA runtime, so they are skipped on CPU-only hosts instead of raising.
if torch.cuda.is_available():
    # Display total GPU memory
    print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

    # Display currently allocated memory
    print(f"Currently allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

    # Display cached memory (reserved by PyTorch but not used)
    print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
else:
    print("CUDA is not available; skipping GPU memory report.")

Total GPU memory: 79.15 GB
Currently allocated: 0.00 GB
Cached: 0.00 GB


In [5]:
# Check disk space
!df -h

Filesystem                                  Size  Used Avail Use% Mounted on
devtmpfs                                    252G     0  252G   0% /dev
tmpfs                                       252G  228K  252G   1% /dev/shm
tmpfs                                       252G   59M  252G   1% /run
tmpfs                                       252G     0  252G   0% /sys/fs/cgroup
/dev/sda3                                    20G  5.3G   15G  27% /
/dev/sda2                                   994M  188M  806M  19% /boot
/dev/sda11                                  359G  401M  358G   1% /tmp
/dev/sda7                                   9.8G  479M  9.3G   5% /var
/dev/sda8                                   9.8G  299M  9.5G   3% /var/log
/dev/sda9                                   9.8G   69M  9.7G   1% /var/log/audit
/dev/sda10                                  9.8G   33M  9.8G   1% /var/tmp
vast1-mghpcc-ib.neu.edu:/discovery/home     155T  133T   23T  86% /home
vast1-mghpcc-ib.neu.edu:/vast_shared    

In [6]:
# Delete the directory "results/student_model_final" (presumably a model saved by an
# earlier run) before continuing.
# NOTE(review): fine_tune_student_model in this notebook saves to
# "<output_dir>/student_model_{output_field}_final", so this exact path may never
# exist — confirm which directory is meant to be removed.
!rm -rf "results/student_model_final"

# Data Preparation

## Prompt Templates

In [7]:
# Chain-of-thought prompt: asks for a step-by-step solution to a coding problem.
# Placeholder: {problem} (filled with str.format).
# NOTE: COT_PROMPT_TEMPLATE is assigned again in a later cell; when the cells run
# in order that later value replaces this one.
COT_PROMPT_TEMPLATE = """Generate a detailed step-by-step solution for this coding problem.
Break down your thought process into clear, concise steps, explaining your reasoning at each stage.

Problem:
{problem}

Step-by-step solution:"""

In [8]:
# Chain-of-thought prompt with explicit guidance (inputs/outputs, algorithm choice,
# edge cases, efficiency). Placeholder: {problem} (filled with str.format).
# This assignment replaces the COT_PROMPT_TEMPLATE defined in the previous cell.
# The text ends with "Step-by-step solution:", the same marker string that
# evaluate_student_model searches for when separating the prompt from the answer.
COT_PROMPT_TEMPLATE = """Generate a detailed step-by-step solution for this coding problem.
Break down your thought process clearly, explaining your reasoning while considering:
- What are the inputs and outputs of the function?
- What algorithm or data structure is most appropriate?
- Are there any edge cases to handle?
- What's the efficiency of your approach?

Be concise in your explanation.

Problem:
{problem}

Step-by-step solution:"""

In [9]:
# Coder prompt: asks for a single fenced ```python code block implementing the
# step-by-step solution. Placeholder: {solution_cot} (filled with str.format).
# NOTE: CODER_PROMPT_TEMPLATE is assigned again in a later cell; when the cells run
# in order that later value replaces this one.
CODER_PROMPT_TEMPLATE = """Generate only a markdown code block that contains clean, efficient
Python code for this coding problem based on the solution approach. The code block must start
with ```python on its own line, then the code, and end with ``` on its own line. Do not include
test cases or code explanations.
Focus on:
- Implementing the key algorithmic insights
- Handling edge cases identified in the solution
- Maintaining readability and efficiency

Step-by-step solution:
{solution_cot}

Python code:"""

In [10]:
# Shorter coder prompt: asks for Python code only, using the step-by-step solution
# as a guideline. Placeholder: {solution_cot} (filled with str.format).
# This assignment replaces the CODER_PROMPT_TEMPLATE defined in the previous cell.
CODER_PROMPT_TEMPLATE = """Generate the python code for this coding problem. Follow the
step-by-step process as a guideline for how to solve the problem. Only return python code.

Step-by-step solution:
{solution_cot}

Python code:"""

In [11]:
# Debugger prompt: asks the model to check the given code for errors and emit a
# corrected version. Placeholder: {code} (filled with str.format).
DEBUGGER_PROMPT_TEMPLATE = """Check the provided python code for any errors. Then regenerate
the code so that any errors have been debugged.

Python code:
{code}

Debugged Python code:"""

In [12]:
# Explainer prompt: asks for a concise explanation of the code, grounded in both the
# step-by-step solution and the code. Placeholders: {solution_cot} and {code}
# (filled with str.format).
EXPLAINER_PROMPT_TEMPLATE = """Generate an explanation of this code using the step-by-step
solution and the code itself. Keep the explanation concise.

Step-by-step solution:
{solution_cot}

Python code:
{code}

Python code explanation:"""

## Dataset Modules

In [13]:
class CodeCraftDataset(Dataset):
    """
    Supervised fine-tuning dataset for Code Craft agents, driven by a prompt template.

    Each item is the tokenized concatenation ``prompt + expected output``, padded or
    truncated to ``max_length``. The labels are a copy of the input ids in which every
    position that must not contribute to the loss is set to -100: the prompt tokens
    and the padding positions.

    Args:
        examples: List of dictionaries that hold all agent prompt information.
        tokenizer: Used to tokenize the inputs to the model.
        prompt_template: The prompt template string with placeholders.
        output_field: The name of the field in examples that contains the expected output.
        max_length: The maximum token length of the inputs.
    """
    def __init__(self, examples, tokenizer, prompt_template, output_field, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.prompt_template = prompt_template
        self.output_field = output_field
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Build the training tensors for one example.

        Returns:
            Dict with 1-D tensors ``input_ids``, ``attention_mask`` and ``labels``.

        Raises:
            KeyError: If the example lacks ``output_field`` or a field required by
                the prompt template.
        """
        example = self.examples[idx]
        output = example[self.output_field]

        # Fill the template with every example field that matches a placeholder
        try:
            prompt = self.prompt_template.format(**example)
        except KeyError as e:
            missing_key = str(e).strip("'")
            raise KeyError(f"Example at index {idx} is missing required field '{missing_key}' "
                          f"for prompt template: {self.prompt_template}") from e

        # Tokenize prompt and expected output as a single sequence
        encoded = self.tokenizer(
            prompt + output,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoded["input_ids"][0]
        attention_mask = encoded["attention_mask"][0]

        # The prompt length is measured by tokenizing the prompt on its own.
        # NOTE(review): this assumes the prompt tokenizes to the same number of tokens
        # inside the combined text — confirm for the tokenizer in use.
        prompt_length = len(self.tokenizer(prompt, return_tensors="pt")["input_ids"][0])

        labels = input_ids.clone()

        # Padding positions must not contribute to the loss
        labels[attention_mask == 0] = -100

        # Mask the prompt starting at the first real token, so the mask also lines up
        # when the tokenizer pads on the left
        real_positions = attention_mask.nonzero(as_tuple=True)[0]
        first_real = int(real_positions[0]) if real_positions.numel() > 0 else 0
        labels[first_real:first_real + prompt_length] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [14]:
def generate_dataset(problem_dataset, task_prompt, solution_field, output_marker,
    model, tokenizer, num_examples=50, max_new_tokens=512, teacher=True, regen=False,
    output_dir="dataset"):
    """
    Generate a dataset by prompting a model to solve problems for distillation.

    Results are cached as JSON in ``output_dir``. The cache file name is built from
    ``solution_field``, the model role and ``num_examples``. An existing cache file is
    returned as-is unless ``regen`` is true.

    Args:
        problem_dataset: List of dictionaries containing problem data
        task_prompt: Prompt template string with placeholders
        solution_field: Field name for the generated solution in output examples
        output_marker: String marker after which the solution starts in the decoded
                       output (or None if the entire generated text is the solution)
        model: The model used to generate solutions
        tokenizer: Tokenizer for the model
        num_examples: Number of examples to generate
        max_new_tokens: Maximum number of newly generated tokens per problem
        teacher: True when the model is the teacher, False when it is the student
                 (only affects log messages and the cache file name)
        regen: True to regenerate the data even if a cached file already exists
        output_dir: Directory to save the generated examples

    Returns:
        List of dictionaries containing the problems and their solutions. Problems
        whose generation raised are logged and left out.
    """
    os.makedirs(output_dir, exist_ok=True)

    model_name = "teacher" if teacher else "student"
    file_name = os.path.join(output_dir, f"{solution_field}_{model_name}_{num_examples}_dataset.json")

    # Reuse the cached dataset unless the caller explicitly asked for regeneration
    if not regen and os.path.exists(file_name):
        with open(file_name, 'r') as examples_file:
            examples = json.load(examples_file)
        print("loaded examples from json")
        return examples

    examples = []
    logger.info(f"Generating {solution_field} with {model_name} for {num_examples} problems...")

    # Take a subset of problems for efficiency
    problems_subset = problem_dataset[:num_examples]

    # Inference mode only needs to be selected once, not per problem
    model.eval()

    for i, problem in enumerate(tqdm(problems_subset, desc=f"Generating {solution_field}")):
        try:
            # Format the prompt with the problem data
            prompt = task_prompt.format(**problem)

            # Tokenize the prompt
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

            # Sample one completion from the model
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    num_return_sequences=1
                )

            # Decode the full sequence (prompt followed by the completion)
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

            if output_marker and output_marker in generated_text:
                # Keep everything after the first occurrence of the marker
                solution_start_idx = generated_text.find(output_marker) + len(output_marker)
                solution = generated_text[solution_start_idx:].strip()
            else:
                # No marker: decode only the tokens produced after the prompt. This
                # avoids relying on the decoded text reproducing the prompt exactly.
                prompt_token_count = inputs["input_ids"].shape[1]
                solution = tokenizer.decode(
                    output[0][prompt_token_count:], skip_special_tokens=True
                ).strip()

            # Create the example with all original problem fields plus the solution
            example = problem.copy()
            example[solution_field] = solution
            examples.append(example)

            # Show the first two examples for a quick visual check
            if i < 2:
                print(f"\nExample {i+1}:")
                print(f"Problem: {example.get('problem', '')[:150]}...")
                print(f"Solution (first 150 chars): {example[solution_field][:150]}...")

            # Log progress details periodically
            if (i + 1) % 10 == 0:
                logger.info(f"Generated {i + 1}/{len(problems_subset)} solutions")

        except Exception as e:
            # Best effort: a failing problem is logged and skipped
            logger.error(f"Error generating solution for problem {i}: {e}")
            continue

    logger.info(f"Successfully generated {len(examples)} {solution_field} solutions")

    # Save the dataset
    with open(file_name, "w") as f:
        json.dump(examples, f, indent=2)

    logger.info(f"Dataset saved to {file_name}")
    return examples

## Load Dataset Functions

In [15]:
# Load MBPP dataset
def load_mbpp_dataset():
    """
    Load the MBPP dataset and convert its train and test splits to problem dicts.

    Returns:
        Tuple ``(train_problems, test_problems)``. Each element is a list of dicts
        with the keys ``problem``, ``test_case`` and ``solution_code``, taken from
        the ``text``, ``test_list`` and ``code`` fields of each MBPP record.
    """
    mbpp = load_dataset("mbpp")

    def as_problem(record):
        # Map MBPP field names onto the names used by the prompt templates
        return {
            "problem": record["text"],
            "test_case": record["test_list"],
            "solution_code": record["code"]
        }

    train_problems = [as_problem(record) for record in mbpp["train"]]
    test_problems = [as_problem(record) for record in mbpp["test"]]

    print(f"Loaded {len(train_problems)} train problems and {len(test_problems)} evaluation problems from MBPP dataset")
    return train_problems, test_problems

In [16]:
# Load BAAI/TACO dataset
def load_taco_dataset():
    """
    Load the BAAI/TACO dataset and convert its train and test splits to problem dicts.

    Returns:
        Tuple ``(train_problems, test_problems)``. Each element is a list of dicts
        with the keys ``problem``, ``test_case`` and ``solution_code``.
    """
    taco = load_dataset("BAAI/TACO")

    def first_solution(raw):
        # NOTE(review): TACO appears to store `solutions` as a JSON-encoded list of
        # strings — confirm against the dataset card. A JSON string is decoded first
        # and plain lists are accepted too; an empty list yields "".
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except json.JSONDecodeError:
                return raw
        if isinstance(raw, (list, tuple)):
            return raw[0] if raw else ""
        return raw

    def as_problem(record):
        # Both splits read the tests from `input_output`; `test_cases` is only a
        # fallback for records that lack that field.
        tests = record["input_output"] if "input_output" in record else record["test_cases"]
        return {
            "problem": record["question"],
            "test_case": tests,
            "solution_code": first_solution(record["solutions"])
        }

    train_problems = [as_problem(record) for record in taco["train"]]
    # Test records go into their own list (they were previously appended to the
    # training list, leaving the test list empty)
    test_problems = [as_problem(record) for record in taco["test"]]

    print(f"Loaded {len(train_problems)} train problems and {len(test_problems)} test problems from TACO dataset")
    return train_problems, test_problems

# Agent Code

## Models

In [17]:
# Load models
def load_models(teacher_model_name, student_model_name):
    """
    Load the teacher and student causal language models with their tokenizers.

    Both models are loaded in float32 with ``device_map="auto"``. A tokenizer that
    has no padding token gets its end-of-sequence token assigned as padding token,
    because CodeCraftDataset pads every example to ``max_length``.

    Args:
        teacher_model_name: Name or path passed to ``from_pretrained`` for the teacher
        student_model_name: Name or path passed to ``from_pretrained`` for the student

    Returns:
        Tuple ``(teacher_model, teacher_tokenizer, student_model, student_tokenizer)``
    """
    def load_pair(role, model_name):
        # Shared loading path so teacher and student are configured identically
        logger.info(f"Loading {role} model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float32
        )
        logger.info(f"{role.capitalize()} model loaded successfully")
        return model, tokenizer

    teacher_model, teacher_tokenizer = load_pair("teacher", teacher_model_name)
    student_model, student_tokenizer = load_pair("student", student_model_name)

    return teacher_model, teacher_tokenizer, student_model, student_tokenizer

## Training

In [18]:
def fine_tune_student_model(student_model, student_tokenizer, train_data, prompt,
                        output_field, batch_size=8, num_epochs=3, learning_rate=5e-5,
                        max_grad_norm=1.0, warmup_steps=0, max_length=512,
                        output_dir="results"):
    """
    Fine-tune the student model on examples generated by the teacher model.

    Trains with AdamW and a OneCycleLR schedule on the language-modelling loss the
    model returns for the labels built by CodeCraftDataset, then saves the model and
    tokenizer to ``<output_dir>/student_model_<output_field>_final``.

    Args:
        student_model: The student model to train
        student_tokenizer: Tokenizer for the student model
        train_data: List of data dictionaries for training
        prompt: The prompt template containing fields for training
        output_field: The key in each training example that holds the target text
        batch_size: Training batch size
        num_epochs: Number of training epochs
        learning_rate: Peak learning rate for the optimizer and scheduler
        max_grad_norm: Maximum gradient norm for gradient clipping
        warmup_steps: Number of steps in the ramp-up phase of the OneCycleLR
            schedule; 0 selects a ramp-up of 10% of all steps
        max_length: the maximum number of tokens in the dataset values
        output_dir: Directory to save the trained model

    Returns:
        The trained student model (the same object, left in training mode).

    Raises:
        ValueError: If ``train_data`` is empty.
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Starting training the student model for {num_epochs} epochs")

    # Fail early with a clear message instead of an error from the scheduler setup
    if len(train_data) == 0:
        raise ValueError("train_data is empty; nothing to fine-tune the student model on")

    # Create PyTorch dataset and dataloader
    dataset = CodeCraftDataset(
        examples=train_data,
        tokenizer=student_tokenizer,
        prompt_template=prompt,
        output_field=output_field,
        max_length=max_length
    )
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True
    )

    # Set up optimizer and learning rate scheduler
    optimizer = optim.AdamW(student_model.parameters(), lr=learning_rate)
    total_steps = len(dataloader) * num_epochs
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=learning_rate, total_steps=total_steps,
        pct_start=warmup_steps/total_steps if warmup_steps > 0 else 0.1
    )

    global_step = 0
    student_model.train()

    # Training loop
    for epoch in range(num_epochs):
        epoch_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            # Move batch to the model's device
            input_ids = batch["input_ids"].to(student_model.device)
            attention_mask = batch["attention_mask"].to(student_model.device)
            labels = batch["labels"].to(student_model.device)

            # Forward pass - the model computes the loss from the labels
            outputs = student_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(student_model.parameters(), max_grad_norm)

            # Update parameters
            optimizer.step()
            scheduler.step()

            # Track loss
            epoch_loss += loss.item()
            global_step += 1

            # Update progress bar
            progress_bar.set_postfix({"loss": loss.item()})

            # Log the current loss every 100 optimizer steps (no checkpoint is written)
            if global_step % 100 == 0:
                logger.info(f"Step {global_step}: loss = {loss.item():.4f}")

        # Compute average epoch loss
        avg_epoch_loss = epoch_loss / len(dataloader)
        logger.info(f"Epoch {epoch+1}/{num_epochs} - Average loss: {avg_epoch_loss:.4f}")

    # Only the final weights are saved; no per-epoch checkpoints are kept
    final_model_path = os.path.join(output_dir, f"student_model_{output_field}_final")
    logger.info(f"Training completed. Saving final model to {final_model_path}")
    student_model.save_pretrained(final_model_path)
    student_tokenizer.save_pretrained(final_model_path)

    return student_model

In [19]:
def logit_distillation_loss(student_logits, teacher_logits, temperature=1.0, alpha=0.5):
    """
    Calculate the knowledge distillation loss between student and teacher logits.

    The loss is the KL divergence from the temperature-softened teacher distribution
    to the temperature-softened student distribution, averaged over all token
    positions and multiplied by ``temperature ** 2`` so that its gradient scale
    stays comparable across temperatures.

    Args:
        student_logits: Logits from the student model [batch_size, seq_len, vocab_size]
        teacher_logits: Logits from the teacher model, same shape as student_logits
        temperature: Temperature parameter to soften the distributions (must be > 0)
        alpha: Not applied here. It is accepted for interface compatibility; the
            caller weights this loss against the cross-entropy loss.

    Returns:
        Scalar tensor with the distillation loss

    Raises:
        ValueError: If temperature is not positive or the logit shapes differ.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if student_logits.shape != teacher_logits.shape:
        raise ValueError(
            f"student_logits {tuple(student_logits.shape)} and teacher_logits "
            f"{tuple(teacher_logits.shape)} must have the same shape"
        )

    # Flatten to [tokens, vocab] so 'batchmean' averages per token rather than per
    # sequence, and compute in float32 for a numerically stable softmax
    vocab_size = student_logits.size(-1)
    student_log_probs = F.log_softmax(
        student_logits.float().reshape(-1, vocab_size) / temperature, dim=-1
    )
    teacher_probs = F.softmax(
        teacher_logits.float().reshape(-1, vocab_size) / temperature, dim=-1
    )

    kl_div = F.kl_div(
        student_log_probs,
        teacher_probs,
        reduction='batchmean',
        log_target=False  # Teacher probs are not in log space
    )

    return kl_div * (temperature ** 2)

In [20]:
def train_with_logit_distillation(
    model, train_dataloader, optimizer, scheduler=None,
    num_epochs=3, device="cuda", alpha=0.5, temperature=2.0,
    max_grad_norm=1.0):
    """
    Train a model using logit distillation.

    Every batch contributes the cross-entropy loss the model returns for its labels.
    When a batch also carries ``teacher_logits``, the distillation loss is mixed in as
    ``(1 - alpha) * ce_loss + alpha * kd_loss``.

    Args:
        model: The student model to train
        train_dataloader: Iterable of batch dicts with ``input_ids``, ``attention_mask``,
            ``labels`` and optionally ``teacher_logits``
        optimizer: Optimizer for training
        scheduler: Learning rate scheduler (optional), stepped once per batch
        num_epochs: Number of training epochs
        device: Unused. Kept for backward compatibility; all tensors are moved to the
            device of the model's first parameter.
        alpha: Weight for distillation loss vs standard cross-entropy loss
        temperature: Temperature for softening logit distributions
        max_grad_norm: Maximum gradient norm for clipping

    Returns:
        Tuple ``(model, losses)`` where ``losses`` holds the average loss per epoch
        (NaN for an epoch in which the dataloader yielded no batches).

    Raises:
        ValueError: If teacher and student logits differ in vocabulary size.
    """
    model.train()
    losses = []

    # All batch tensors, including teacher logits, follow the model's own device
    model_device = next(model.parameters()).device

    for epoch in range(num_epochs):
        epoch_losses = []
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            input_ids = batch["input_ids"].to(model_device)
            attention_mask = batch["attention_mask"].to(model_device)
            labels = batch["labels"].to(model_device)

            # Forward pass; hidden states are not requested because only the loss
            # and the logits are used
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # Standard cross-entropy loss from labels
            ce_loss = outputs.loss
            total_loss = ce_loss

            if "teacher_logits" in batch:
                teacher_logits = batch["teacher_logits"].to(model_device)
                student_logits = outputs.logits

                if teacher_logits.shape[-1] != student_logits.shape[-1]:
                    raise ValueError(
                        f"teacher vocabulary size {teacher_logits.shape[-1]} does not match "
                        f"student vocabulary size {student_logits.shape[-1]}"
                    )

                # Trim both to the shorter sequence when the lengths differ
                if teacher_logits.shape != student_logits.shape:
                    min_len = min(teacher_logits.shape[1], student_logits.shape[1])
                    teacher_logits = teacher_logits[:, :min_len, :]
                    student_logits = student_logits[:, :min_len, :]

                kd_loss = logit_distillation_loss(
                    student_logits,
                    teacher_logits,
                    temperature=temperature
                )

                # Combine losses
                total_loss = (1 - alpha) * ce_loss + alpha * kd_loss

            # Backward pass
            optimizer.zero_grad()
            total_loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # Update parameters
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            # Track loss
            epoch_losses.append(total_loss.item())
            progress_bar.set_postfix({"loss": total_loss.item()})

        # Calculate and report average loss for the epoch
        if epoch_losses:
            avg_loss = sum(epoch_losses) / len(epoch_losses)
        else:
            logger.warning(f"Epoch {epoch+1}/{num_epochs} - dataloader yielded no batches")
            avg_loss = float("nan")
        logger.info(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")
        losses.append(avg_loss)

    return model, losses

## Evaluation

In [21]:
def _generate_texts(model, tokenizer, prompts, max_length, desc):
    """Sample one completion per prompt and return the decoded sequences."""
    texts = []
    for prompt in tqdm(prompts, desc=desc):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                num_return_sequences=1
            )
        texts.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return texts


def _extract_solution(text, marker="Step-by-step solution:"):
    """Return the text after the first marker, or the whole text if the marker is absent."""
    marker_idx = text.find(marker)
    if marker_idx == -1:
        return text.strip()
    return text[marker_idx + len(marker):].strip()


def _average(values):
    """Arithmetic mean of a list of numbers; 0 for an empty list."""
    return sum(values) / len(values) if values else 0


def evaluate_student_model(student_model, student_tokenizer, test_problems, teacher_model=None,
                          batch_size=4, max_length=512, output_dir="results/evaluations",
                          teacher_tokenizer=None):
    """
    Evaluate the student model on a set of test problems.

    Each problem is formatted with COT_PROMPT_TEMPLATE and one sampled completion is
    generated per problem. The metrics are the average word count and the average
    number of distinct reasoning keywords per solution. When a teacher model is
    given, the same metrics are computed for it together with student/teacher ratios.

    Args:
        student_model: Trained student model
        student_tokenizer: Tokenizer for the student model
        test_problems: List of test problems (dicts with a ``problem`` key)
        teacher_model: Optional teacher model for comparison
        batch_size: Number of problems handled per outer-loop iteration; generation
            itself runs one prompt at a time
        max_length: Value forwarded to ``generate(max_length=...)``
        output_dir: Directory to save evaluation results
        teacher_tokenizer: Optional tokenizer for the teacher model; defaults to
            ``student_tokenizer``

    Returns:
        Dictionary with evaluation metrics, prompts and raw generations. Averages
        are 0 when ``test_problems`` is empty.
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Evaluating student model on {len(test_problems)} test problems")

    has_teacher = teacher_model is not None
    if teacher_tokenizer is None:
        teacher_tokenizer = student_tokenizer

    # Set models to evaluation mode
    student_model.eval()
    if has_teacher:
        teacher_model.eval()

    results = {
        "total_problems": len(test_problems),
        "student_generations": [],
        "teacher_generations": [] if has_teacher else None,
        "prompts": []
    }

    # Process test problems in batches
    for i in range(0, len(test_problems), batch_size):
        batch_prompts = [
            COT_PROMPT_TEMPLATE.format(problem=problem["problem"])
            for problem in test_problems[i:i+batch_size]
        ]
        results["prompts"].extend(batch_prompts)

        results["student_generations"].extend(
            _generate_texts(student_model, student_tokenizer, batch_prompts,
                            max_length, "Generating student solutions")
        )

        # If teacher model is provided, generate solutions for comparison
        if has_teacher:
            results["teacher_generations"].extend(
                _generate_texts(teacher_model, teacher_tokenizer, batch_prompts,
                                max_length, "Generating teacher solutions")
            )

    # Strip the prompt from every generation
    logger.info("Processing generated solutions")
    student_solutions = [_extract_solution(text) for text in results["student_generations"]]
    teacher_solutions = (
        [_extract_solution(text) for text in results["teacher_generations"]]
        if has_teacher else None
    )

    # Calculate some basic metrics
    logger.info("Calculating evaluation metrics")

    # Average solution length in whitespace-separated words
    student_avg_length = _average([len(solution.split()) for solution in student_solutions])
    results["student_avg_word_count"] = student_avg_length

    if has_teacher:
        teacher_avg_length = _average([len(solution.split()) for solution in teacher_solutions])
        results["teacher_avg_word_count"] = teacher_avg_length
        results["length_ratio"] = student_avg_length / teacher_avg_length if teacher_avg_length > 0 else 0

    # Count how many distinct reasoning keywords occur (as substrings) in each solution
    reasoning_keywords = ["first", "second", "third", "next", "then", "finally", "step", "let's", "because", "reason"]

    def keyword_count(solution):
        solution_lower = solution.lower()
        return sum(1 for keyword in reasoning_keywords if keyword in solution_lower)

    results["student_avg_reasoning_markers"] = _average(
        [keyword_count(solution) for solution in student_solutions]
    )

    if has_teacher:
        results["teacher_avg_reasoning_markers"] = _average(
            [keyword_count(solution) for solution in teacher_solutions]
        )
        results["reasoning_marker_ratio"] = (results["student_avg_reasoning_markers"] /
                                           results["teacher_avg_reasoning_markers"]
                                           if results["teacher_avg_reasoning_markers"] > 0 else 0)

    # Save a few example comparisons
    with open(os.path.join(output_dir, "solution_examples.txt"), "w") as f:
        for i in range(min(5, len(student_solutions))):
            f.write(f"Problem {i+1}:\n")
            f.write(f"{results['prompts'][i]}\n\n")
            f.write(f"Student solution:\n{student_solutions[i]}\n\n")
            if has_teacher:
                f.write(f"Teacher solution:\n{teacher_solutions[i]}\n\n")
            f.write("-" * 80 + "\n\n")

    # Save a summary without the full generations for easier reading
    with open(os.path.join(output_dir, "evaluation_results.json"), "w") as f:
        summary_results = {k: v for k, v in results.items()
                         if k not in ["student_generations", "teacher_generations", "prompts"]}
        json.dump(summary_results, f, indent=2)

    # Save the full results separately
    with open(os.path.join(output_dir, "full_results.json"), "w") as f:
        json.dump(results, f, indent=2)

    logger.info(f"Evaluation complete. Results saved to {output_dir}")
    return results

In [22]:
def track_best_model(evaluation_results, best_metrics, model_path, output_dir="results/best_model"):
    """
    Track and save the best student model based on evaluation metrics.

    A model's score is 0.7 * reasoning_marker_ratio + 0.3 * length_ratio
    (missing keys count as 0). When the current score is strictly greater than
    the score derived from best_metrics, the top-level files of model_path are
    copied into output_dir and a best_metrics.json summary is written there.

    Args:
        evaluation_results: Results dictionary from evaluate_student_model
        best_metrics: Dictionary with current best metrics (None or {} if there is none yet)
        model_path: Path to the current model directory
        output_dir: Directory to save the best model

    Returns:
        Updated best_metrics dictionary (the input dictionary when the current
        model is not an improvement)
    """
    # Imported locally: the notebook's import cell does not bring in shutil,
    # so the copy below would otherwise fail with a NameError.
    import shutil

    os.makedirs(output_dir, exist_ok=True)

    if best_metrics is None:
        best_metrics = {}

    def _score(metrics):
        # Scoring function to rank models (higher is better); prioritizes the
        # reasoning marker ratio over the solution length ratio.
        return (
            metrics.get("reasoning_marker_ratio", 0) * 0.7 +
            metrics.get("length_ratio", 0) * 0.3
        )

    current_score = _score(evaluation_results)
    best_score = _score(best_metrics)

    # Written as "not >" so a NaN score is never treated as an improvement.
    if not (current_score > best_score):
        return best_metrics

    logger.info(f"New best model found! Score: {current_score:.4f} (previous: {best_score:.4f})")

    # Update best metrics
    best_metrics = {
        "score": current_score,
        "model_path": model_path,
        "reasoning_marker_ratio": evaluation_results.get("reasoning_marker_ratio", 0),
        "length_ratio": evaluation_results.get("length_ratio", 0),
        "student_avg_reasoning_markers": evaluation_results.get("student_avg_reasoning_markers", 0),
        "student_avg_word_count": evaluation_results.get("student_avg_word_count", 0)
    }

    if not os.path.isdir(model_path):
        logger.warning(f"Model path {model_path} is not a directory; only the metrics are saved")
    elif os.path.samefile(model_path, output_dir):
        # Clearing output_dir here would delete the very model we want to keep.
        logger.info(f"Best model already lives in {output_dir}; skipping copy")
    else:
        logger.info(f"Copying best model from {model_path} to {output_dir}")

        # Clear previous best model (top-level files only; subdirectories are left alone)
        for file in os.listdir(output_dir):
            file_path = os.path.join(output_dir, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

        # Copy new best model (top-level files only)
        for file in os.listdir(model_path):
            source_file = os.path.join(model_path, file)
            if os.path.isfile(source_file):
                shutil.copy(source_file, os.path.join(output_dir, file))

    # Save best metrics
    with open(os.path.join(output_dir, "best_metrics.json"), "w") as f:
        json.dump(best_metrics, f, indent=2)

    return best_metrics

# Main

## Params and Funcs

In [23]:
# Global Params
TEACHER_EXAMPLE_LEN = 5 # number of train mbpp problems (passed as num_examples for teacher generation)
STUDENT_EXAMPLE_LEN = 5 # passed as num_examples for student generation
GENERATED_TOKEN_LEN = 100 # used as max_new_tokens for generation and as max_length for fine-tuning

# Training Params
NUM_EPOCHS = 6
LEARNING_RATE = 2e-5
BATCH_SIZE = 10
# NOTE(review): evaluates to 0.25 (a float) while TEACHER_EXAMPLE_LEN is 5 —
# confirm that fine_tune_student_model accepts a fractional warmup step count.
WARMUP_STEPS = TEACHER_EXAMPLE_LEN * 0.05

In [24]:
clear_gpu_memory()

print("Loading MBPP dataset...")
mbpp_train_examples, mbpp_test_examples = load_mbpp_dataset()

# Instruct pair: used by the CoT and Explainer stages below.
print("Loading Instruct models...")
teacher_model, tokenizer, student_model, student_tokenizer = load_models("Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct")

# Coder pair: used by the Coder and Debugger stages below.
print("Loading Coder models...")
code_teacher_model, code_tokenizer, code_student_model, code_student_tokenizer = load_models("Qwen/Qwen2.5-Coder-7B-Instruct", "Qwen/Qwen2.5-Coder-0.5B-Instruct")

# save student model initial states for efficient memory storage when fine tuning later
# (each stage restores these snapshots with load_state_dict before using a student model)
student_initial_state = {k: v.detach().clone() for k, v in student_model.state_dict().items()}
code_student_initial_state = {k: v.detach().clone() for k, v in code_student_model.state_dict().items()}

Loading MBPP dataset...


2025-04-18 01:43:49,745 - INFO - Loading teacher model: Qwen/Qwen2.5-7B-Instruct


Loaded 374 train problems and 500 evaluation problems from MBPP dataset
Loading Instruct models...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-18 01:43:51,477 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-04-18 01:44:00,373 - INFO - Teacher model loaded successfully
2025-04-18 01:44:00,374 - INFO - Loading student model: Qwen/Qwen2.5-0.5B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-18 01:44:00,659 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-04-18 01:44:01,332 - INFO - Student model loaded successfully
2025-04-18 01:44:01,333 - INFO - Loading teacher model: Qwen/Qwen2.5-Coder-7B-Instruct


Loading Coder models...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-18 01:44:03,088 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-04-18 01:44:11,427 - INFO - Teacher model loaded successfully
2025-04-18 01:44:11,428 - INFO - Loading student model: Qwen/Qwen2.5-Coder-0.5B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-18 01:44:11,724 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-04-18 01:44:12,371 - INFO - Student model loaded successfully


In [25]:
def generate_base_model_examples(train_problems, test_problems, teacher_model, student_model, tokenizer):
    """
    Build the two baseline datasets for the currently configured agent stage.

    The stage is selected through the notebook-level globals PROMPT_TEMPLATE,
    SOLUTION_FIELD and OUTPUT_MARKER, which are read when this function runs.

    Args:
        train_problems: Problems handed to the teacher model
        test_problems: Problems handed to the not-yet-fine-tuned student model
        teacher_model: Model that produces the training examples
        student_model: Model that produces the baseline student examples
        tokenizer: Tokenizer passed to generate_dataset for both models

    Returns:
        Tuple (train_examples, untrained_examples) as returned by generate_dataset
    """
    print(f"Generating {SOLUTION_FIELD} examples using Teacher model...")
    teacher_options = {
        "num_examples": TEACHER_EXAMPLE_LEN,
        "max_new_tokens": GENERATED_TOKEN_LEN,
    }
    train_examples = generate_dataset(
        train_problems, PROMPT_TEMPLATE, SOLUTION_FIELD, OUTPUT_MARKER,
        teacher_model, tokenizer,
        **teacher_options
    )

    print(f"Generating {SOLUTION_FIELD} examples using untrained Student model...")
    student_options = {
        "num_examples": STUDENT_EXAMPLE_LEN,
        "max_new_tokens": GENERATED_TOKEN_LEN,
        "teacher": False,
    }
    untrained_examples = generate_dataset(
        test_problems, PROMPT_TEMPLATE, SOLUTION_FIELD, OUTPUT_MARKER,
        student_model, tokenizer,
        **student_options
    )

    return train_examples, untrained_examples

In [26]:
def start_fine_tuning(student_model, tokenizer, train_examples):
    """
    Fine-tune a student model for the currently configured agent stage.

    Stage settings (PROMPT_TEMPLATE, SOLUTION_FIELD) and the training
    hyper-parameters are read from the notebook-level globals.

    Args:
        student_model: Model forwarded to fine_tune_student_model
        tokenizer: Tokenizer forwarded as student_tokenizer
        train_examples: Examples forwarded as train_data

    Returns:
        Whatever fine_tune_student_model returns
    """
    print(f"Fine-Tuning {SOLUTION_FIELD} on Student model...")
    training_config = dict(
        prompt=PROMPT_TEMPLATE,
        output_field=SOLUTION_FIELD,
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        warmup_steps=WARMUP_STEPS,
        max_length=GENERATED_TOKEN_LEN,
    )
    return fine_tune_student_model(
        student_model=student_model,
        student_tokenizer=tokenizer,
        train_data=train_examples,
        **training_config
    )

In [27]:
def generate_trained_model_examples(test_problems, trained_student_model, tokenizer):
    """
    Generate examples with a fine-tuned student for the currently configured stage.

    Prints the active SOLUTION_FIELD, OUTPUT_MARKER and PROMPT_TEMPLATE (one per
    line) before generating, then delegates to generate_dataset.

    Args:
        test_problems: Problems handed to the trained student model
        trained_student_model: Fine-tuned student model
        tokenizer: Tokenizer passed to generate_dataset

    Returns:
        The examples returned by generate_dataset
    """
    print(f"Generating {SOLUTION_FIELD} examples using Trained Student model...")
    print(SOLUTION_FIELD, OUTPUT_MARKER, PROMPT_TEMPLATE, sep="\n")

    generation_options = {
        # one more than the untrained run so the untrained student data file is not overwritten
        "num_examples": STUDENT_EXAMPLE_LEN + 1,
        "max_new_tokens": GENERATED_TOKEN_LEN,
        "teacher": False,
    }
    return generate_dataset(
        test_problems, PROMPT_TEMPLATE, SOLUTION_FIELD, OUTPUT_MARKER,
        trained_student_model, tokenizer,
        **generation_options
    )

## Restart and Run all above
If the disk or GPU memory fills up, restart the kernel and re-run every cell above this point. Then use the commented-out reload cells below to load the data generated so far from JSON.

## CoT Agent

In [28]:
clear_gpu_memory()
# Park the Coder pair on the CPU first (note the call parentheses: a bare
# `.cpu` is just an attribute lookup and moves nothing), then put the
# Instruct pair used by the CoT stage on the GPU.
code_teacher_model.cpu()
code_student_model.cpu()
teacher_model.to(device)
student_model.to(device)

# CoT Agent Params
PROMPT_TEMPLATE = COT_PROMPT_TEMPLATE
SOLUTION_FIELD = "solution_cot"
OUTPUT_MARKER = "Step-by-step solution:"

In [29]:
clear_gpu_memory()

# Reset the student to its saved initial weights, then build the CoT datasets:
# teacher solutions for the MBPP train problems and baseline (untrained)
# student solutions for the MBPP test problems.
student_model.load_state_dict(student_initial_state)
train_cot_examples, untrained_cot_examples = generate_base_model_examples(
    mbpp_train_examples,
    mbpp_test_examples,
    teacher_model,
    student_model,
    tokenizer
)

2025-04-18 01:44:12,692 - INFO - Generating solution_cot with teacher for 5 problems...


Generating solution_cot examples using Teacher model...


Generating solution_cot:  20%|██        | 1/5 [00:02<00:11,  2.96s/it]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): 1. Understand inputs and outputs
   - Input: A list of n pairs (a, b) where 0 ≤ a < b ≤ 10^9
   - Output: The length of the longest chain that can be ...


Generating solution_cot:  40%|████      | 2/5 [00:05<00:07,  2.60s/it]


Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): 1. Understand inputs and outputs
   - Input: A string of characters (can contain upper and lower case letters, numbers, special characters)
   - Outpu...


Generating solution_cot: 100%|██████████| 5/5 [00:12<00:00,  2.47s/it]
2025-04-18 01:44:25,053 - INFO - Successfully generated 5 solution_cot solutions
2025-04-18 01:44:25,060 - INFO - Dataset saved to dataset/solution_cot_teacher_5_dataset.json
2025-04-18 01:44:25,060 - INFO - Generating solution_cot with student for 5 problems...


Generating solution_cot examples using untrained Student model...


Generating solution_cot:  20%|██        | 1/5 [00:01<00:07,  1.84s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): 1. Define the function `remove_first_last_occurrence` that takes two parameters: `s` (the input string) and `char` (a single character).
2. Initialize...


Generating solution_cot:  40%|████      | 2/5 [00:03<00:05,  1.84s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): 1. Initialize an empty list called "result" to store the sorted matrix elements.

2. Iterate through each row of the matrix using a nested loop:

   -...


Generating solution_cot: 100%|██████████| 5/5 [00:09<00:00,  1.85s/it]
2025-04-18 01:44:34,311 - INFO - Successfully generated 5 solution_cot solutions
2025-04-18 01:44:34,320 - INFO - Dataset saved to dataset/solution_cot_student_5_dataset.json


In [30]:
# # in case of disk/memory filling, this reloads the examples from json

# clear_gpu_memory()

# mdpp_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_cot_examples = json.load(mdpp_examples_file)

# print(train_cot_examples[0])

In [31]:
clear_gpu_memory()

# Fine-tune the student model
# The weights are restored from the initial snapshot first so training starts from the base checkpoint.
student_model.load_state_dict(student_initial_state)
trained_cot_student_model = start_fine_tuning(student_model, tokenizer, train_cot_examples)

2025-04-18 01:44:34,450 - INFO - Starting training the student model for 6 epochs


Fine-Tuning solution_cot on Student model...


Epoch 1/6: 100%|██████████| 1/1 [00:00<00:00,  3.70it/s, loss=2.92]
2025-04-18 01:44:35,718 - INFO - Epoch 1/6 - Average loss: 2.9186
Epoch 2/6: 100%|██████████| 1/1 [00:00<00:00,  6.66it/s, loss=0.616]
2025-04-18 01:44:35,869 - INFO - Epoch 2/6 - Average loss: 0.6162
Epoch 3/6: 100%|██████████| 1/1 [00:00<00:00,  6.70it/s, loss=0.341]
2025-04-18 01:44:36,020 - INFO - Epoch 3/6 - Average loss: 0.3409
Epoch 4/6: 100%|██████████| 1/1 [00:00<00:00,  6.69it/s, loss=0.341]
2025-04-18 01:44:36,170 - INFO - Epoch 4/6 - Average loss: 0.3408
Epoch 5/6: 100%|██████████| 1/1 [00:00<00:00,  6.71it/s, loss=0.302]
2025-04-18 01:44:36,320 - INFO - Epoch 5/6 - Average loss: 0.3017
Epoch 6/6: 100%|██████████| 1/1 [00:00<00:00,  6.70it/s, loss=0.274]
2025-04-18 01:44:36,471 - INFO - Epoch 6/6 - Average loss: 0.2744
2025-04-18 01:44:36,471 - INFO - Training completed. Saving final model to results/student_model_solution_cot_final


In [32]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [33]:
# Generate fine-tuned student model outputs
# (these CoT solutions become the test-side input of the Coder stage below)
trained_cot_examples = generate_trained_model_examples(mbpp_test_examples, trained_cot_student_model, tokenizer)

# Evaluation is currently disabled; the call is kept for reference.
# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

torch.cuda.empty_cache()  # Clear CUDA cache

2025-04-18 01:44:41,381 - INFO - Generating solution_cot with student for 6 problems...


Generating solution_cot examples using Trained Student model...
solution_cot
Step-by-step solution:
Generate a detailed step-by-step solution for this coding problem.
Break down your thought process clearly, explaining your reasoning while considering:
- What are the inputs and outputs of the function?
- What algorithm or data structure is most appropriate?
- Are there any edge cases to handle?
- What's the efficiency of your approach?

Be concise in your explanation.

Problem:
{problem}

Step-by-step solution:


Generating solution_cot:  17%|█▋        | 1/6 [00:01<00:09,  1.88s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): 1. Understand inputs and outputs
   - Input: "abcdefg"
     - Outputs: "bdf"
   - Input: "abcdef"
     - Outputs: ""
   - Input: "abcedfg"
     - Outp...


Generating solution_cot:  33%|███▎      | 2/6 [00:03<00:07,  1.86s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): 1. Understand inputs & outputs
   - Input: A 2D array (matrix)
   - Output: Sorted row sums

   - Understand inputs & outputs
   - Understand inputs &...


Generating solution_cot: 100%|██████████| 6/6 [00:11<00:00,  1.86s/it]
2025-04-18 01:44:52,540 - INFO - Successfully generated 6 solution_cot solutions
2025-04-18 01:44:52,545 - INFO - Dataset saved to dataset/solution_cot_student_6_dataset.json


## Coder Agent

In [34]:
clear_gpu_memory()
# Swap devices: the Instruct pair goes to the CPU, the Coder pair to the GPU.
teacher_model.cpu()
code_teacher_model.to(device)
student_model.cpu()
code_student_model.to(device)

# Coder Agent Params
PROMPT_TEMPLATE = CODER_PROMPT_TEMPLATE
SOLUTION_FIELD = "code"
OUTPUT_MARKER = "Python code:"

In [35]:
clear_gpu_memory()

# The Coder stage consumes the CoT stage's outputs: the teacher's CoT examples
# are the train-side input and the trained CoT student's examples the test-side input.
# NOTE(review): this passes the Instruct `tokenizer` although `code_tokenizer`
# was loaded together with the Coder models — confirm this is intended.
code_student_model.load_state_dict(code_student_initial_state)
train_code_examples, untrained_code_examples = generate_base_model_examples(
    train_cot_examples,
    trained_cot_examples,
    code_teacher_model,
    code_student_model,
    tokenizer
)

2025-04-18 01:45:03,036 - INFO - Generating code with teacher for 5 problems...


Generating code examples using Teacher model...


Generating code:  20%|██        | 1/5 [00:02<00:08,  2.05s/it]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): ```python def findLongestChain(pairs): pairs.sort(key=lambda x: x[1]) dp = [1] * len(pairs) for i in range(1, len(pairs)): for j in range(i): if pairs...


Generating code:  40%|████      | 2/5 [00:02<00:04,  1.37s/it]


Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): ```python def find_first_duplicate_char(s): seen = set() for char in s: if char in seen: return char seen.add(char) return None ```
```...


Generating code: 100%|██████████| 5/5 [00:09<00:00,  1.97s/it]
2025-04-18 01:45:12,907 - INFO - Successfully generated 5 code solutions
2025-04-18 01:45:12,911 - INFO - Dataset saved to dataset/code_teacher_5_dataset.json
2025-04-18 01:45:12,912 - INFO - Generating code with student for 5 problems...


Generating code examples using untrained Student model...


Generating code:  20%|██        | 1/5 [00:01<00:07,  1.86s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ```python
def reverse_string(input_str):
    # Reverse the string using slicing
    return input_str[::-1]

# Test cases
print(reverse_string("abcdefg...


Generating code:  40%|████      | 2/5 [00:03<00:05,  1.87s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): ```python
def sorted_row_sums(matrix):
    # Sort the rows based on their sum
    sorted_rows = sorted(matrix, key=sum)
    return sorted_rows

# Exam...


Generating code: 100%|██████████| 5/5 [00:09<00:00,  1.86s/it]
2025-04-18 01:45:22,234 - INFO - Successfully generated 5 code solutions
2025-04-18 01:45:22,245 - INFO - Dataset saved to dataset/code_student_5_dataset.json


In [36]:
# # in case of disk/memory filling, this reloads the examples from json

# clear_gpu_memory()

# train_code_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_code_examples = json.load(train_code_examples_file)

# print(train_code_examples[0])

# trained_cot_examples_file = open(f"dataset/solution_cot_student_{STUDENT_EXAMPLE_LEN+1}_dataset.json")
# trained_cot_examples = json.load(trained_cot_examples_file)

# print(trained_cot_examples[0])

In [37]:
clear_gpu_memory()

# Fine-tune the student model
# NOTE(review): the recorded output of this cell shows loss=nan in every epoch —
# investigate before relying on the checkpoint saved by this run.
code_student_model.load_state_dict(code_student_initial_state)
trained_code_student_model = start_fine_tuning(code_student_model, tokenizer, train_code_examples)

2025-04-18 01:45:22,400 - INFO - Starting training the student model for 6 epochs


Fine-Tuning code on Student model...


Epoch 1/6: 100%|██████████| 1/1 [00:00<00:00,  5.96it/s, loss=nan]
2025-04-18 01:45:22,571 - INFO - Epoch 1/6 - Average loss: nan
Epoch 2/6: 100%|██████████| 1/1 [00:00<00:00,  6.64it/s, loss=nan]
2025-04-18 01:45:22,723 - INFO - Epoch 2/6 - Average loss: nan
Epoch 3/6: 100%|██████████| 1/1 [00:00<00:00,  6.62it/s, loss=nan]
2025-04-18 01:45:22,876 - INFO - Epoch 3/6 - Average loss: nan
Epoch 4/6: 100%|██████████| 1/1 [00:00<00:00,  6.63it/s, loss=nan]
2025-04-18 01:45:23,028 - INFO - Epoch 4/6 - Average loss: nan
Epoch 5/6: 100%|██████████| 1/1 [00:00<00:00,  6.63it/s, loss=nan]
2025-04-18 01:45:23,180 - INFO - Epoch 5/6 - Average loss: nan
Epoch 6/6: 100%|██████████| 1/1 [00:00<00:00,  6.66it/s, loss=nan]
2025-04-18 01:45:23,331 - INFO - Epoch 6/6 - Average loss: nan
2025-04-18 01:45:23,331 - INFO - Training completed. Saving final model to results/student_model_code_final


In [38]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [39]:
# Generate Code examples using Trained Student model
# (input: the trained CoT student's solutions; output feeds the Debugger stage)
trained_code_examples = generate_trained_model_examples(trained_cot_examples, trained_code_student_model, tokenizer)

# Evaluation is currently disabled; the call is kept for reference.
# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

torch.cuda.empty_cache()  # Clear CUDA cache

2025-04-18 01:45:28,397 - INFO - Generating code with student for 6 problems...


Generating code examples using Trained Student model...
code
Python code:
Generate the python code for this coding problem. Follow the
step-by-step process as a guideline for how to solve the problem. Only return python code.

Step-by-step solution:
{solution_cot}

Python code:


Generating code:  17%|█▋        | 1/6 [00:01<00:09,  1.87s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ```python
def remove_char(input_string):
    # Initialize an empty string to store the result
    result = ""
    
    # Iterate over each character i...


Generating code:  33%|███▎      | 2/6 [00:03<00:07,  1.87s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): To solve this problem, we can use the built-in `sorted()` function in Python. This function sorts the elements of an iterable (like a list or tuple) a...


Generating code: 100%|██████████| 6/6 [00:11<00:00,  1.87s/it]
2025-04-18 01:45:39,593 - INFO - Successfully generated 6 code solutions
2025-04-18 01:45:39,609 - INFO - Dataset saved to dataset/code_student_6_dataset.json


## Debugger Agent

In [40]:
clear_gpu_memory()
# The Debugger stage reuses the Coder pair on the GPU; the Instruct pair stays on the CPU.
teacher_model.cpu()
code_teacher_model.to(device)
student_model.cpu()
code_student_model.to(device)

# Debugger Agent Params
PROMPT_TEMPLATE = DEBUGGER_PROMPT_TEMPLATE
SOLUTION_FIELD = "debugged"
OUTPUT_MARKER = "Debugged Python code:"

In [41]:
clear_gpu_memory()

# The Debugger stage consumes the Coder stage's outputs (teacher code as the
# train-side input, trained student code as the test-side input).
# NOTE(review): if start_fine_tuning trains in place and returns the same
# object, this reload also resets trained_code_student_model — confirm.
code_student_model.load_state_dict(code_student_initial_state)
train_debug_examples, untrained_debug_examples = generate_base_model_examples(
    train_code_examples,
    trained_code_examples,
    code_teacher_model,
    code_student_model,
    tokenizer
)

2025-04-18 01:45:39,871 - INFO - Generating debugged with teacher for 5 problems...


Generating debugged examples using Teacher model...


Generating debugged:  40%|████      | 2/5 [00:02<00:02,  1.18it/s]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): ```python def findLongestChain(pairs): pairs.sort(key=lambda x: x[1]) dp = [1] * len(pairs) for i in range(1, len(pairs)): for j in range(i): if pairs...

Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): ...


Generating debugged: 100%|██████████| 5/5 [00:07<00:00,  1.46s/it]
2025-04-18 01:45:47,157 - INFO - Successfully generated 5 debugged solutions
2025-04-18 01:45:47,161 - INFO - Dataset saved to dataset/debugged_teacher_5_dataset.json
2025-04-18 01:45:47,162 - INFO - Generating debugged with student for 5 problems...


Generating debugged examples using untrained Student model...


Generating debugged:  20%|██        | 1/5 [00:01<00:07,  1.84s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ```python
def remove_char(input_string):
    # Initialize an empty string to store the result
    result = ""
    
    # Iterate over each character i...


Generating debugged:  40%|████      | 2/5 [00:03<00:05,  1.84s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): ```
To solve this problem, we can use the built-in `sorted()` function in Python. This function sorts the elements of an iterable (like a list or tupl...


Generating debugged: 100%|██████████| 5/5 [00:09<00:00,  1.84s/it]
2025-04-18 01:45:56,382 - INFO - Successfully generated 5 debugged solutions
2025-04-18 01:45:56,388 - INFO - Dataset saved to dataset/debugged_student_5_dataset.json


In [42]:
# # in case of disk/memory filling, this reloads the examples from json

# clear_gpu_memory()

# train_debug_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_debug_examples = json.load(train_debug_examples_file)

# print(train_debug_examples[0])

# trained_code_examples_file = open(f"dataset/code_student_{STUDENT_EXAMPLE_LEN+1}_dataset.json")
# trained_code_examples = json.load(trained_code_examples_file)

# print(trained_code_examples[0])

In [43]:
clear_gpu_memory()

# Fine-tune the student model
# NOTE(review): in the recorded run the loss falls to ~3e-6 and the trained
# model's generations further down are empty — check the debug training targets.
code_student_model.load_state_dict(code_student_initial_state)
trained_debug_student_model = start_fine_tuning(code_student_model, tokenizer, train_debug_examples)

2025-04-18 01:45:56,532 - INFO - Starting training the student model for 6 epochs


Fine-Tuning debugged on Student model...


Epoch 1/6: 100%|██████████| 1/1 [00:00<00:00,  5.96it/s, loss=12.8]
2025-04-18 01:45:56,703 - INFO - Epoch 1/6 - Average loss: 12.7680
Epoch 2/6: 100%|██████████| 1/1 [00:00<00:00,  6.65it/s, loss=2.85]
2025-04-18 01:45:56,854 - INFO - Epoch 2/6 - Average loss: 2.8532
Epoch 3/6: 100%|██████████| 1/1 [00:00<00:00,  6.68it/s, loss=0.0325]
2025-04-18 01:45:57,005 - INFO - Epoch 3/6 - Average loss: 0.0325
Epoch 4/6: 100%|██████████| 1/1 [00:00<00:00,  6.66it/s, loss=0.000152]
2025-04-18 01:45:57,156 - INFO - Epoch 4/6 - Average loss: 0.0002
Epoch 5/6: 100%|██████████| 1/1 [00:00<00:00,  6.66it/s, loss=7.74e-6]
2025-04-18 01:45:57,307 - INFO - Epoch 5/6 - Average loss: 0.0000
Epoch 6/6: 100%|██████████| 1/1 [00:00<00:00,  6.68it/s, loss=3.45e-6]
2025-04-18 01:45:57,458 - INFO - Epoch 6/6 - Average loss: 0.0000
2025-04-18 01:45:57,458 - INFO - Training completed. Saving final model to results/student_model_debugged_final


In [44]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [46]:
# Generate debug examples using Trained Student model...
# (input: the code produced by the trained Coder student)
trained_debug_examples = generate_trained_model_examples(trained_code_examples, trained_debug_student_model, tokenizer)

# Evaluation is currently disabled; the call is kept for reference.
# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

torch.cuda.empty_cache()  # Clear CUDA cache

2025-04-18 01:46:38,970 - INFO - Generating debugged with student for 6 problems...


Generating debugged examples using Trained Student model...
debugged
Debugged Python code:
Check the provided python code for any errors. Then regenerate
the code so that any errors have been debugged.

Python code:
{code}

Debugged Python code:


Generating debugged: 100%|██████████| 6/6 [00:00<00:00, 42.32it/s]
2025-04-18 01:46:39,114 - INFO - Successfully generated 6 debugged solutions
2025-04-18 01:46:39,117 - INFO - Dataset saved to dataset/debugged_student_6_dataset.json



Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ...

Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): ...


## Explainer Agent

In [47]:
clear_gpu_memory()
teacher_model.to(device)
code_teacher_model.cpu
student_model.to(device)
code_student_model.cpu

# CoT Agent Params
SOLUTION_FIELD = "explanation"
OUTPUT_MARKER = "Python code explanation:"

In [None]:
clear_gpu_memory()

#TODO: concat debug code examples to the train code examples(?)
student_model.load_state_dict(student_initial_state)
train_explain_examples, untrained_explain_examples = generate_base_model_examples(
    train_code_examples,
    trained_code_examples,
    code_teacher_model,
    code_student_model,
    tokenizer
)

2025-04-18 01:47:11,406 - INFO - Generating explanation with teacher for 5 problems...


Generating explanation examples using Teacher model...


Generating explanation:  40%|████      | 2/5 [00:02<00:03,  1.05s/it]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): ```python 
def findLongestChain(pairs):
    pairs.sort(key=lambda x: x[1])
    dp = [1] * len(pairs)
    for i in range(1, len(pairs)):
        for j ...

Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): ...


Generating explanation:  80%|████████  | 4/5 [00:05<00:01,  1.44s/it]

In [None]:
# # in case of disk/memory filling, this reloads the examples from json

# clear_gpu_memory()

# train_explain_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_explain_examples = json.load(train_explain_examples_file)

# print(train_explain_examples[0])

In [None]:
clear_gpu_memory()

# Fine-tune the student model
# The weights are restored from the initial snapshot first so training starts from the base checkpoint.
student_model.load_state_dict(student_initial_state)
trained_explain_student_model = start_fine_tuning(student_model, tokenizer, train_explain_examples)

In [None]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [None]:
# Generate explanation examples using Trained Student model
# (input: the code produced by the trained Coder student)
trained_explain_examples = generate_trained_model_examples(trained_code_examples, trained_explain_student_model, tokenizer)

# Evaluation is currently disabled; the call is kept for reference.
# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

torch.cuda.empty_cache()  # Clear CUDA cache

# Extras

In [None]:
def extract_problem_description(source_code):
    """
    Extracts the problem description from the first docstring in the source code,
    whether it's enclosed in triple double quotes or triple single quotes.

    Args:
        source_code: Source text expected to contain a triple-quoted docstring

    Returns:
        The docstring text with every line stripped, blank lines dropped and
        the remaining lines joined by single spaces

    Raises:
        ValueError: If no triple-quoted string is found in source_code
    """
    # The back-reference forces the closing delimiter to match the opening one.
    docstring_pattern = re.compile(r'("""|\'\'\')(.*?)\1', re.DOTALL)
    match = docstring_pattern.search(source_code)

    if match is None:
        raise ValueError(f"Error: Unable to extract problem description. Please check the format of the prompt:\n{source_code}")

    # Clean up leading/trailing whitespace on each line
    description = match.group(2)
    cleaned_lines = [line.strip() for line in description.strip().splitlines() if line.strip()]
    return ' '.join(cleaned_lines)

def extract_code_header(source_code):
    """
    Extracts everything from the beginning of the source code up to
    the first occurrence of either triple single quotes or triple double quotes.

    Args:
        source_code: Source text expected to contain a triple-quoted docstring

    Returns:
        The text before the first triple quote with every line stripped, blank
        lines dropped and the remaining lines joined by single spaces

    Raises:
        ValueError: If source_code contains no triple quotes
    """
    # Match from start of string to the first triple quotes (single or double)
    header_pattern = re.compile(r'^(.*?)(?="""|\'\'\')', re.DOTALL)
    match = header_pattern.search(source_code)

    if match is None:
        raise ValueError(f"Error: Unable to extract code header. Please check the format of the prompt:\n{source_code}")

    # Clean up leading/trailing whitespace on each line
    header = match.group(1)
    cleaned_lines = [line.strip() for line in header.strip().splitlines() if line.strip()]
    return ' '.join(cleaned_lines)

def load_human_eval_dataset():
    """
    Load the "openai_humaneval" dataset and convert its "test" split into a
    list of problem records.

    Returns:
        A list of dicts, one per dataset row and in dataset order, with keys:
        "problem" (docstring text extracted from the prompt), "code_header"
        (prompt text before the docstring), "test_case" (the raw prompt) and
        "solution_code" (prompt followed by the canonical solution).
    """
    dataset = load_dataset("openai_humaneval")

    # NOTE(review): "test_case" stores the raw prompt text, not a separate
    # test field — confirm that this is what downstream consumers expect.
    return [
        {
            "problem": extract_problem_description(row["prompt"]),
            "code_header": extract_code_header(row["prompt"]),
            "test_case": row["prompt"],
            "solution_code": row["prompt"] + row["canonical_solution"],
        }
        for row in dataset["test"]
    ]

# Prompt asking a model for a step-by-step (chain-of-thought) solution.
# Placeholder: {problem}. The text ends with "Step-by-step solution:" so the
# generated answer follows that marker.
COT_PROMPT_TEMPLATE = """Generate a detailed step-by-step solution for this coding problem.
Break down your thought process clearly, explaining your reasoning while considering:
- What are the inputs and outputs of the function?
- What algorithm or data structure is most appropriate?
- Are there any edge cases to handle?
- What's the efficiency of your approach?

Be concise in your explanation.

Problem:
{problem}

Step-by-step solution:"""

# CODER_PROMPT_TEMPLATE = """Generate only a markdown code block that contains clean, efficient
# Python code for this coding problem based on the solution approach. The code block must start
# with ```python on its own line, then the code, and end with ``` on its own line.
# Focus on:
# - Implementing the key algorithmic insights
# - Handling edge cases identified in the solution
# - Maintaining readability and efficiency
# Do not include:
# - test cases
# - extra code explanation

# Step-by-step solution:
# {cot_solution}

# Python code:
# {code_header}"""

# Prompt asking a model to turn a step-by-step solution into Python code.
# Placeholders: {cot_solution} and {code_header}. The text ends with
# "Python code:" followed by the code header, so the generated answer follows it.
CODER_PROMPT_TEMPLATE = """Generate only a markdown code block that contains clean, efficient
Python code for this coding problem based on the solution approach. The code block must start
with ```python on its own line, then the code, and end with ``` on its own line. Do not include
test cases or code explanations.
Focus on:
- Implementing the key algorithmic insights
- Handling edge cases identified in the solution
- Maintaining readability and efficiency

Step-by-step solution:
{cot_solution}

Python code:
{code_header}"""


# Number of HumanEval problems pushed through the pipeline and the per-example
# generation budget; both generation passes below share these values.
HUMAN_EVAL_NUM_EXAMPLES = 100
HUMAN_EVAL_MAX_NEW_TOKENS = 512

human_eval = load_human_eval_dataset()
print(human_eval[0])

print("loaded dataset")

# Checkpoint directory of the CoT student; model and tokenizer are loaded from the same path.
trained_cot_student_path = "results/student_model_cot_solution_final"
trained_cot_student_model = AutoModelForCausalLM.from_pretrained(trained_cot_student_path).to(device)
trained_cot_tokenizer = AutoTokenizer.from_pretrained(trained_cot_student_path)

# Off-the-shelf coder model used for the code-writing step.
untrained_coder_model_name = "Qwen/Qwen2.5-Coder-0.5B"
untrained_coder_tokenizer = AutoTokenizer.from_pretrained(untrained_coder_model_name)
untrained_coder_model = AutoModelForCausalLM.from_pretrained(
    untrained_coder_model_name,
    device_map="auto",
    torch_dtype=torch.float32
)

print("Loaded models")

# Pass 1: step-by-step solutions, stored under the "cot_solution" field.
trained_cot_examples = generate_dataset(
    human_eval,
    COT_PROMPT_TEMPLATE,
    "cot_solution",
    "Step-by-step solution:",
    trained_cot_student_model,
    trained_cot_tokenizer,
    num_examples=HUMAN_EVAL_NUM_EXAMPLES,
    max_new_tokens=HUMAN_EVAL_MAX_NEW_TOKENS,
    teacher=False
)

print("cot examples generated")

# Pass 2: code generated from the pass-1 examples, stored under the "gen_code" field.
code_examples = generate_dataset(
    trained_cot_examples,
    CODER_PROMPT_TEMPLATE,
    "gen_code",
    "Python code:",
    untrained_coder_model,
    untrained_coder_tokenizer,
    num_examples=HUMAN_EVAL_NUM_EXAMPLES,
    max_new_tokens=HUMAN_EVAL_MAX_NEW_TOKENS,
    teacher=False
)

print("code generated")



In [None]:
# Index of the generated example to inspect.
i = 22

# Uncomment to also show the problem statement and the chain-of-thought text.
#print('\nproblem:')
#print(new_code_examples[i]['problem'])
#print('\ncot')
#print(new_code_examples[i]['cot_solution'])
#print('\ngenerated_code')
# NOTE(review): `new_code_examples` is not assigned in any cell visible here (the
# generation cell binds `code_examples`) — confirm where it is defined, otherwise
# this cell fails on a fresh kernel.
print(new_code_examples[i]['gen_code'])

In [None]:
def extract_before_def(source_code):
    """
    Return everything that precedes the first function definition.

    The cut is made at the first ``def`` keyword that starts a statement, i.e.
    one that is preceded on its line only by optional indentation (and an
    optional ``async``). Occurrences of the letters "def" inside other words
    such as ``defaultdict`` or ``undefined`` are ignored, so import lines in
    the prelude are never truncated. Original formatting is preserved.

    Args:
        source_code: Source text containing at least one function definition.

    Returns:
        The text from the start of ``source_code`` up to, but not including,
        the ``def`` keyword. Empty when the source starts with ``def``.

    Raises:
        ValueError: If no function definition is found.
    """
    # MULTILINE lets ^ anchor at every line start; \b rejects identifiers that
    # merely begin with "def".
    match = re.search(r'^[ \t]*(?:async[ \t]+)?(def)\b', source_code, re.MULTILINE)
    if match is None:
        raise ValueError(f"Error: Unable to extract content before 'def'. No 'def' keyword found in:\n{source_code}")
    # Cut exactly at the keyword so any text before it on the same line is kept.
    return source_code[:match.start(1)]

def extract_until_code_block(source_code):
    """
    Return the part of ``source_code`` that comes before the first run of
    three backticks (```), with formatting untouched.

    When the text contains no triple backticks, the sentinel string 'BAD'
    is returned instead.
    """
    fence_at = source_code.find('```')
    if fence_at == -1:
        return 'BAD'
    return source_code[:fence_at]

# Reference solutions, kept alongside the candidates for inspection.
solutions = [example['solution_code'] for example in new_code_examples]

# Candidate program = whatever precedes the first `def` in the reference
# solution + the generated text up to the closing code fence.
generated_codes = [
    extract_before_def(example['solution_code']) + extract_until_code_block(example['gen_code'])
    for example in new_code_examples
]
print(generated_codes[0])

def remove_bad_strings(string_array):
    """
    Split a sequence of strings into those without the substring 'BAD' and
    the positions of those that contain it, printing a summary of the
    positions found.

    Args:
        string_array: A list of strings to filter

    Returns:
        A tuple ``(clean_strings, bad_indices)``: the strings that do not
        contain 'BAD' in their original order, and the indices (into
        ``string_array``) of the strings that do.
    """
    # Materialise once so the input is traversed a single time.
    indexed = list(enumerate(string_array))
    flagged = [pos for pos, text in indexed if 'BAD' in text]
    kept = [text for _, text in indexed if 'BAD' not in text]

    # Report which positions were dropped
    if not flagged:
        print("No strings containing 'BAD' found.")
    else:
        print(f"Found 'BAD' in strings at indices: {flagged}")

    return kept, flagged

# Drop candidates flagged with 'BAD' and remember their positions in `generated_codes`.
edited_codes, bad_indices = remove_bad_strings(generated_codes)
# Number of candidates that remain.
print(len(edited_codes))

In [None]:
# `human_eval` is the list of problem dicts returned by load_human_eval_dataset(),
# so it cannot be indexed by split name; preview the first few records instead.
human_eval[:3]

In [None]:
%pip install evaluate

from evaluate import load

# Load evaluation metric
code_eval = load("code_eval")

import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

problems = human_eval
test = []
for i, item in enumerate(edited_codes):
    edited_codes[i] = [item]
pred = edited_codes
c = 0

for i, s in enumerate(human_eval[:100]):
    if i not in bad_indices:
        test.append(s)
        c = c+1
        print(c)

pass_at_k = code_eval.compute(
        predictions=pred,
        references=test,
        k=[1]
)
print(pass_at_k)
print(pass_at_k[0]['pass@1']*100)

In [None]:
# Generate CoT (Chain of Thought) dataset
cot_examples = generate_dataset(
    problem_dataset=mbpp_problems,
    task_prompt=COT_PROMPT_TEMPLATE,
    solution_field="solution_cot",
    output_marker="Step-by-step solution:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/cot_dataset.json"
)

# Generate code dataset from CoT
code_examples = generate_dataset(
    problem_dataset=cot_examples,  # Use the output from CoT as input
    task_prompt=DEVELOPER_PROMPT_TEMPLATE,
    solution_field="code",
    output_marker="Python code:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/code_dataset.json"
)

# Generate debugged code dataset
debugged_examples = generate_dataset(
    problem_dataset=code_examples,  # Use the code examples as input
    task_prompt=DEBUGGER_PROMPT_TEMPLATE,
    solution_field="debugged_code",
    output_marker="Debugged Python code:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/debugged_code_dataset.json"
)

# Generate code explanations
explanation_examples = generate_dataset(
    problem_dataset=code_examples,  # Use code examples that also have CoT
    task_prompt=EXPLAINER_PROMPT_TEMPLATE,
    solution_field="explanation",
    output_marker="Explanation of the code:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/explanation_dataset.json"
)

In [None]:
# Print every record in `mbpp_problems`: its index, problem text, test cases and solution.
# NOTE(review): this prints the whole collection — consider slicing if it is large.
for i, example in enumerate(mbpp_problems):
  print(f"Problem number: {i}")
  print(f"Problem: {example['problem']}")
  print("Test cases:")
  print(example['test_case'])
  print("Code Solution:")
  print(example['solution'])