# Setup

## Package Installation

In [1]:
#%pip install --upgrade pip
#%pip install transformers==4.37.0
#%pip uninstall torch torchvision torchaudio -y
#%pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 -y
#%pip install torch torchvision torchaudio
#%pip install tqdm
#%pip install urllib3==1.26.15
#%pip install accelerate==0.25.0
#%pip install datasets

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
import copy

import os
import gc
import re
import json
import logging
from tqdm import tqdm
from datasets import load_dataset

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

print(torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('We are using the device {}.'.format(device))
if torch.cuda.is_available():
    print(f"Device count: {torch.cuda.device_count()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

1.13.1+cu116
We are using the device cuda.
Device count: 1
Device name: NVIDIA A100-SXM4-80GB


## Utils

In [3]:
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
clear_gpu_memory()

In [4]:
# Display total GPU memory
print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Display currently allocated memory
print(f"Currently allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

# Display cached memory (reserved by PyTorch but not used)
print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

Total GPU memory: 79.15 GB
Currently allocated: 0.00 GB
Cached: 0.00 GB


In [5]:
# # Check disk space
!df -h

Filesystem                                  Size  Used Avail Use% Mounted on
devtmpfs                                    252G     0  252G   0% /dev
tmpfs                                       252G  8.0K  252G   1% /dev/shm
tmpfs                                       252G   59M  252G   1% /run
tmpfs                                       252G     0  252G   0% /sys/fs/cgroup
/dev/sda3                                    20G  5.3G   15G  27% /
/dev/sda2                                   994M  188M  806M  19% /boot
/dev/sda11                                  359G  401M  358G   1% /tmp
/dev/sda7                                   9.8G  479M  9.3G   5% /var
/dev/sda8                                   9.8G  299M  9.5G   3% /var/log
/dev/sda9                                   9.8G   72M  9.7G   1% /var/log/audit
/dev/sda10                                  9.8G   33M  9.8G   1% /var/tmp
vast1-mghpcc-ib.neu.edu:/discovery/home     155T  133T   23T  86% /home
vast1-mghpcc-ib.neu.edu:/vast_shared    

In [6]:
import sys

def check_versions():
    # Python version
    print("Python:", sys.version.split()[0])
    
    # PyTorch and CUDA
    try:
        import torch
        print("PyTorch:", torch.__version__)
        print("CUDA available:", torch.cuda.is_available())
        if torch.cuda.is_available():
            print("PyTorch CUDA version:", torch.version.cuda)
    except ImportError:
        print("PyTorch: Not installed")
    
    # Key packages
    try:
        import transformers
        print("transformers:", transformers.__version__)
    except ImportError:
        print("transformers: Not installed")
        
    try:
        import accelerate
        print("accelerate:", accelerate.__version__)
    except ImportError:
        print("accelerate: Not installed")
    
#     try:
#         import bitsandbytes
#         print("bitsandbytes:", bitsandbytes.__version__)
#     except ImportError:
#         print("bitsandbytes: Not installed")
    
    try:
        import huggingface_hub
        print("huggingface_hub:", huggingface_hub.__version__)
    except ImportError:
        print("huggingface_hub: Not installed")
    
    try:
        import datasets
        print("datasets:", datasets.__version__)
    except ImportError:
        print("datasets: Not installed")

check_versions()

Python: 3.9.12
PyTorch: 1.13.1+cu116
CUDA available: True
PyTorch CUDA version: 11.6
transformers: 4.37.0
accelerate: 0.25.0
huggingface_hub: 0.30.1
datasets: 3.5.0


# Data Preparation

## Prompt Templates

In [7]:
COT_PROMPT_TEMPLATE = """Provide ONE concise algorithm strategy for this coding problem in EXACTLY 4 numbered points:

1. Input/output: Single sentence describing parameters and return value
2. Approach: Name the exact algorithm/data structure
3. Key steps: 3-4 bullet points with specific algorithmic operations
4. Edge cases: 2-3 specific edge conditions, no explanations needed

Keep total response short. Be direct and technical.
DO NOT include pseudocode, explanations, test cases, or implementation details.

Problem:
{problem}

Algorithm strategy:"""

In [8]:
CODER_PROMPT_TEMPLATE = """Generate only the Python code implementation for this problem.
Problem:
{problem}

Using this algorithm strategy:
{solution_cot}

STRICT REQUIREMENTS:
- Your output must begin with ```python
- Your output must end with ```
- ONLY write clean, efficient Python code
- NO text before or after the code block
- NO descriptions of what the code does

Python code:"""

In [9]:
DEBUGGER_PROMPT_TEMPLATE = """Fix all bugs and inefficiencies in this Python code.

Problem:
{problem}

Original code:
{code}

CRITICAL REQUIREMENTS:
- Output MUST start with ```python and end with ``` ONLY
- NO explanations before or after the code
- NO test cases or example output
- NO justification of your changes
- MINIMAL code changes to fix bugs/inefficiencies

Debugged code:"""

In [10]:
EXPLAINER_PROMPT_TEMPLATE = """Create a short, beginner-friendly explanation of this code.

Problem:
{problem}

Code to explain:
{code}

Keep your explanation to 1-2 paragraphs.
Focus on:
- What the code accomplishes
- The core algorithm approach used
- One insightful observation about why it works
- Any clever tricks worth noting

Use friendly language that makes the solution approachable.

Explanation:"""

## Dataset Modules

In [11]:
class CodeCraftDataset(Dataset):
    """
    A generalized dataset for Code Craft agents that works with various prompt templates.

    Args:
        examples: List of dictionaries that hold all agent prompt information.
        tokenizer: Used to tokenize the inputs to the model.
        prompt_template: The prompt template string with placeholders.
        output_field: The name of the field in examples that contains the expected output.
        max_length: The maximum token length of the inputs.
    """
    def __init__(self, examples, tokenizer, prompt_template, output_field, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.prompt_template = prompt_template
        self.output_field = output_field
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        output = example[self.output_field]

        # Create prompt by formatting template with example data
        # This will use all fields from the example that match placeholders in the template
        try:
            prompt = self.prompt_template.format(**example)
        except KeyError as e:
            missing_key = str(e).strip("'")
            raise KeyError(f"Example at index {idx} is missing required field '{missing_key}' "
                          f"for prompt template: {self.prompt_template}")

        # Combine prompt with expected output
        full_text_with_output = prompt + output

        # Tokenize the combined text
        encoded = self.tokenizer(
            full_text_with_output,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Create labels (same as input_ids but with -100 for prompt tokens)
        prompt_tokens = self.tokenizer(prompt, return_tensors="pt")["input_ids"][0]
        prompt_length = len(prompt_tokens)

        labels = encoded["input_ids"].clone()
        labels[0, :prompt_length] = -100  # Don't compute loss for prompt tokens

        result = {
            "input_ids": encoded["input_ids"][0],
            "attention_mask": encoded["attention_mask"][0],
            "labels": labels[0]
        }

        return result

In [12]:
def generate_dataset(problem_dataset, task_prompt, solution_field, output_marker,
    model, tokenizer, num_examples=50, max_new_tokens=512, temperature=0.7, teacher=True, regen=False,
    output_dir="dataset"):
    """
    Generate a dataset by prompting a teacher model to solve problems for distillation.

    Args:
        problem_dataset: List of dictionaries containing problem data
        task_prompt: Prompt template string with placeholders
        solution_field: Field name for the generated solution in output examples
        output_marker: String marker after which the solution starts in the model output
                       (or None if the entire output is the solution)
        model: The model used to generate solutions
        tokenizer: Tokenizer for the model
        num_examples: Number of examples to generate
        max_new_tokens: Maximum token length for generation
        teacher: a flag indicating if the model is teacher (true) or student (f)
        regen: a flag indicating if the data should be regenerated if it already exists
        output_dir: Directory to save the generated examples

    Returns:
        List of dictionaries containing the problems and their solutions
    """
    os.makedirs(output_dir, exist_ok=True)

    # Get the model type from the teacher param
    if teacher:
        model_name = "teacher"
    else:
        model_name = "student"

    # If indicated not to regenerate the examples and they exist then return them
    file_name = os.path.join(output_dir, f"{solution_field}_{model_name}_{num_examples}_dataset.json")
    if regen and os.path.exists(file_name):
        with open(file_name, 'r') as examples_file:
            examples = json.load(examples_file)
        print("loaded examples from json")
        return examples

    examples = []
    logger.info(f"Generating {solution_field} with {model_name} for {num_examples} problems...")

    # Take a subset of problems for efficiency
    problems_subset = problem_dataset[:num_examples]

    for i, problem in enumerate(tqdm(problems_subset, desc=f"Generating {solution_field}")):
        try:
            # Format the prompt with the problem data
            prompt = task_prompt.format(**problem)

            # Tokenize the prompt
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

            # Generate the solution from the model
            model.eval()
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    num_return_sequences=1
                )

            # Decode the model output
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

            # Extract the solution portion if an output marker is provided
            if output_marker and output_marker in generated_text:
                solution_start_idx = generated_text.find(output_marker) + len(output_marker)
                solution = generated_text[solution_start_idx:].strip()
            else:
                # Use the entire output if no marker is provided or found
                solution = generated_text.replace(prompt, "").strip()

            # Create the example with all original problem fields plus the solution
            example = problem.copy()  # Preserve all original fields
            example[solution_field] = solution  # Add the generated solution
            examples.append(example)

            # Save a few examples for inspection
            if i < 2:
                print(f"\nExample {i+1}:")
                print(f"Problem: {example['problem'][:150]}...")
                print(f"Solution (first 150 chars): {example[solution_field][:150]}...")

            # Log progress details periodically
            if (i + 1) % 10 == 0:
                logger.info(f"Generated {i + 1}/{len(problems_subset)} solutions")

        except Exception as e:
            logger.error(f"Error generating solution for problem {i}: {e}")
            continue

    logger.info(f"Successfully generated {len(examples)} {solution_field} solutions")

    # Save the dataset
    with open(file_name, "w") as f:
        json.dump(examples, f, indent=2)

    logger.info(f"Dataset saved to {file_name}")
    return examples

## Load Dataset Functions

In [13]:
# Load MBPP dataset
def load_mbpp_dataset():
    mbpp = load_dataset("mbpp")

    train_problems = []
    # Extract problems from the MBPP dataset with correct field names
    for item in mbpp["train"]:
        train_problems.append({
            "problem": item["text"],
            "test_case": item["test_list"],
            "solution_code": item["code"]
        })

    test_problems = []
    for item in mbpp["test"]:
        test_problems.append({
            "problem": item["text"],
            "test_case": item["test_list"],
            "solution_code": item["code"]
        })

    print(f"Loaded {len(train_problems)} train problems and {len(test_problems)} evaluation problems from MBPP dataset")
    return train_problems, test_problems

In [14]:
# Load BAAI/TACO dataset
def load_taco_dataset():
    taco = load_dataset("BAAI/TACO")

    train_problems = []
    for item in taco["train"]:
        train_problems.append({
            "problem": item["question"],
            "test_case": item["input_output"],
            "solution_code": item["solutions"][0]
        })

    test_problems = []
    for item in taco["test"]:
        train_problems.append({
            "problem": item["question"],
            "test_case": item["test_cases"],
            "solution_code": item["solutions"][0]
        })

    print(f"Loaded {len(train_problems)} train problems and {len(test_problems)} test problems from TACO dataset")
    return train_problems, test_problems

# Agent Code

## Models

In [15]:
# Load models
def load_models(teacher_model_name, student_model_name):
    logger.info(f"Loading teacher model: {teacher_model_name}")
    teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)
    teacher_model = AutoModelForCausalLM.from_pretrained(
        teacher_model_name,
        device_map="auto",
        torch_dtype=torch.float32
    )
    logger.info(f"Teacher model loaded successfully")

    logger.info(f"Loading student model: {student_model_name}")
    student_tokenizer = AutoTokenizer.from_pretrained(student_model_name)
    student_model = AutoModelForCausalLM.from_pretrained(
        student_model_name,
        device_map="auto",
        torch_dtype=torch.float32
    )
    logger.info(f"Student model loaded successfully")

    return teacher_model, teacher_tokenizer, student_model, student_tokenizer

## Training

In [16]:
def fine_tune_student_model(student_model, student_tokenizer, train_data, prompt,
                        output_field, batch_size=8, num_epochs=3, learning_rate=5e-5,
                        max_grad_norm=1.0, warmup_steps=0, max_length=512,
                        output_dir="results"):
    """
    Fine-tune the student model on examples generated by the teacher model.

    Args:
        student_model: The student model to train
        student_tokenizer: Tokenizer for the student model
        train_data: List of data dictionaries for training
        prompt: The prompt containing fields for training
        output: The output data field to train on
        batch_size: Training batch size
        num_epochs: Number of training epochs
        learning_rate: Learning rate for the optimizer
        max_grad_norm: Maximum gradient norm for gradient clipping
        warmup_steps: Linear warmup steps for the learning rate scheduler
        max_length: the maximum number of tokens in the dataset values
        output_dir: Directory to save the trained model
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Starting training the student model for {num_epochs} epochs")

    # Create PyTorch dataset and dataloader
    dataset = CodeCraftDataset(
        examples=train_data,
        tokenizer=student_tokenizer,
        prompt_template=prompt,
        output_field=output_field,
        max_length=max_length
    )
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True
    )

    # Set up optimizer and learning rate scheduler
    optimizer = optim.AdamW(student_model.parameters(), lr=learning_rate)
    total_steps = len(dataloader) * num_epochs
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=learning_rate, total_steps=total_steps,
        pct_start=warmup_steps/total_steps if warmup_steps > 0 else 0.1
    )

    # Set up training tracking
    best_loss = float('inf')
    global_step = 0
    student_model.train()

    # Training loop
    for epoch in range(num_epochs):
        epoch_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            # Move batch to device
            input_ids = batch["input_ids"].to(student_model.device)
            attention_mask = batch["attention_mask"].to(student_model.device)
            labels = batch["labels"].to(student_model.device)

            # Forward pass - compute student model outputs
            outputs = student_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(student_model.parameters(), max_grad_norm)

            # Update parameters
            optimizer.step()
            scheduler.step()

            # Track loss
            epoch_loss += loss.item()
            global_step += 1

            # Update progress bar
            progress_bar.set_postfix({"loss": loss.item()})

            # Save checkpoint occasionally
            if global_step % 100 == 0:
                logger.info(f"Step {global_step}: loss = {loss.item():.4f}")

        # Compute average epoch loss
        avg_epoch_loss = epoch_loss / len(dataloader)
        logger.info(f"Epoch {epoch+1}/{num_epochs} - Average loss: {avg_epoch_loss:.4f}")

        # Save checkpoint if it's the best model so far
        # if avg_epoch_loss < best_loss:
        #     best_loss = avg_epoch_loss
        #     checkpoint_path = os.path.join(output_dir, f"student_model_{output_field}_epoch_{epoch+1}")
        #     logger.info(f"Saving best model so far (loss: {best_loss:.4f}) to {checkpoint_path}")
        #     student_model.save_pretrained(checkpoint_path)
        #     student_tokenizer.save_pretrained(checkpoint_path)

    # Save final model
    final_model_path = os.path.join(output_dir, f"student_model_{output_field}_final")
    logger.info(f"Training completed. Saving final model to {final_model_path}")
    student_model.save_pretrained(final_model_path)
    student_tokenizer.save_pretrained(final_model_path)

    return student_model

## Evaluation

In [17]:
def evaluate_student_model(student_model, student_tokenizer, test_problems, teacher_model=None,
                          batch_size=4, max_length=512, temperature=0.7, output_dir="results/evaluations"):
    """
    Evaluate the student model on a set of test problems.

    Args:
        student_model: Trained student model
        student_tokenizer: Tokenizer for the student model
        test_problems: List of test problems to evaluate on
        teacher_model: Optional teacher model for comparison
        batch_size: Batch size for evaluation
        max_length: Maximum sequence length for generation
        output_dir: Directory to save evaluation results

    Returns:
        Dictionary with evaluation metrics
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Evaluating student model on {len(test_problems)} test problems")

    # Set models to evaluation mode
    student_model.eval()
    if teacher_model is not None:
        teacher_model.eval()

    results = {
        "total_problems": len(test_problems),
        "student_generations": [],
        "teacher_generations": [] if teacher_model else None,
        "prompts": []
    }

    # Process test problems in batches
    for i in range(0, len(test_problems), batch_size):
        batch_problems = test_problems[i:i+batch_size]
        batch_prompts = []

        for problem in batch_problems:
            prompt = PROMPT_TEMPLATE.format(problem=problem["problem"])
            batch_prompts.append(prompt)
            results["prompts"].append(prompt)

        # Generate solutions with student model
        student_outputs = []
        for prompt in tqdm(batch_prompts, desc="Generating student solutions"):
            inputs = student_tokenizer(prompt, return_tensors="pt").to(student_model.device)

            student_model.eval()
            with torch.no_grad():
                output = student_model.generate(
                    **inputs,
                    max_length=max_length,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    num_return_sequences=1
                )

            decoded_output = student_tokenizer.decode(output[0], skip_special_tokens=True)
            student_outputs.append(decoded_output)

        results["student_generations"].extend(student_outputs)

        # If teacher model is provided, generate solutions for comparison
        if teacher_model:
            teacher_outputs = []
            for prompt in tqdm(batch_prompts, desc="Generating teacher solutions"):
                inputs = student_tokenizer(prompt, return_tensors="pt").to(teacher_model.device)

                with torch.no_grad():
                    output = teacher_model.generate(
                        **inputs,
                        max_length=max_length,
                        temperature=temperature,
                        do_sample=True,
                        top_p=0.9,
                        num_return_sequences=1
                    )

                decoded_output = student_tokenizer.decode(output[0], skip_special_tokens=True)
                teacher_outputs.append(decoded_output)

            results["teacher_generations"].extend(teacher_outputs)

    # Process and extract solutions
    logger.info("Processing generated solutions")
    student_solutions = []
    teacher_solutions = [] if teacher_model else None

    for output in results["student_generations"]:
        solution_start_marker = "Step-by-step solution:"
        solution_start_idx = output.find(solution_start_marker) + len(solution_start_marker)
        solution = output[solution_start_idx:].strip()
        student_solutions.append(solution)

    if teacher_model:
        for output in results["teacher_generations"]:
            solution_start_marker = "Step-by-step solution:"
            solution_start_idx = output.find(solution_start_marker) + len(solution_start_marker)
            solution = output[solution_start_idx:].strip()
            teacher_solutions.append(solution)

    # Calculate some basic metrics
    logger.info("Calculating evaluation metrics")

    # Calculate average solution length
    student_avg_length = sum(len(solution.split()) for solution in student_solutions) / len(student_solutions)
    results["student_avg_word_count"] = student_avg_length

    if teacher_model:
        teacher_avg_length = sum(len(solution.split()) for solution in teacher_solutions) / len(teacher_solutions)
        results["teacher_avg_word_count"] = teacher_avg_length
        results["length_ratio"] = student_avg_length / teacher_avg_length if teacher_avg_length > 0 else 0

    # Check for step-by-step reasoning keywords
    reasoning_keywords = ["first", "second", "third", "next", "then", "finally", "step", "let's", "because", "reason"]
    student_keyword_counts = []

    for solution in student_solutions:
        solution_lower = solution.lower()
        count = sum(1 for keyword in reasoning_keywords if keyword in solution_lower)
        student_keyword_counts.append(count)

    results["student_avg_reasoning_markers"] = sum(student_keyword_counts) / len(student_keyword_counts)

    if teacher_model:
        teacher_keyword_counts = []
        for solution in teacher_solutions:
            solution_lower = solution.lower()
            count = sum(1 for keyword in reasoning_keywords if keyword in solution_lower)
            teacher_keyword_counts.append(count)

        results["teacher_avg_reasoning_markers"] = sum(teacher_keyword_counts) / len(teacher_keyword_counts)
        results["reasoning_marker_ratio"] = (results["student_avg_reasoning_markers"] /
                                           results["teacher_avg_reasoning_markers"]
                                           if results["teacher_avg_reasoning_markers"] > 0 else 0)

    # Save a few example comparisons
    with open(os.path.join(output_dir, "solution_examples.txt"), "w") as f:
        for i in range(min(5, len(student_solutions))):
            f.write(f"Problem {i+1}:\n")
            f.write(f"{results['prompts'][i]}\n\n")
            f.write(f"Student solution:\n{student_solutions[i]}\n\n")
            if teacher_model:
                f.write(f"Teacher solution:\n{teacher_solutions[i]}\n\n")
            f.write("-" * 80 + "\n\n")

    # Save all evaluation results
    with open(os.path.join(output_dir, "evaluation_results.json"), "w") as f:
        # Create a summary version without the full generations for easier reading
        summary_results = {k: v for k, v in results.items()
                         if k not in ["student_generations", "teacher_generations", "prompts"]}
        json.dump(summary_results, f, indent=2)

    # Save the full results separately
    with open(os.path.join(output_dir, "full_results.json"), "w") as f:
        json.dump(results, f, indent=2)

    logger.info(f"Evaluation complete. Results saved to {output_dir}")
    return results

In [18]:
def track_best_model(evaluation_results, best_metrics, model_path, output_dir="results/best_model"):
    """
    Track and save the best student model based on evaluation metrics.

    Args:
        evaluation_results: Results dictionary from evaluate_student_model
        best_metrics: Dictionary with current best metrics
        model_path: Path to the current model
        output_dir: Directory to save the best model

    Returns:
        Updated best_metrics dictionary
    """
    os.makedirs(output_dir, exist_ok=True)

    # Define a scoring function to rank models (higher is better)
    # Here we prioritize reasoning marker ratio and solution length ratio
    current_score = (
        evaluation_results.get("reasoning_marker_ratio", 0) * 0.7 +
        evaluation_results.get("length_ratio", 0) * 0.3
    )

    best_score = (
        best_metrics.get("reasoning_marker_ratio", 0) * 0.7 +
        best_metrics.get("length_ratio", 0) * 0.3
    )

    # Check if current model is better than the best so far
    if current_score > best_score:
        logger.info(f"New best model found! Score: {current_score:.4f} (previous: {best_score:.4f})")

        # Update best metrics
        best_metrics = {
            "score": current_score,
            "model_path": model_path,
            "reasoning_marker_ratio": evaluation_results.get("reasoning_marker_ratio", 0),
            "length_ratio": evaluation_results.get("length_ratio", 0),
            "student_avg_reasoning_markers": evaluation_results.get("student_avg_reasoning_markers", 0),
            "student_avg_word_count": evaluation_results.get("student_avg_word_count", 0)
        }

        # Copy the model to the best model directory
        if os.path.exists(model_path):
            logger.info(f"Copying best model from {model_path} to {output_dir}")

            # Clear previous best model
            if os.path.exists(output_dir):
                for file in os.listdir(output_dir):
                    file_path = os.path.join(output_dir, file)
                    if os.path.isfile(file_path):
                        os.remove(file_path)

            # Copy new best model
            for file in os.listdir(model_path):
                source_file = os.path.join(model_path, file)
                if os.path.isfile(source_file):
                    shutil.copy(source_file, os.path.join(output_dir, file))

        # Save best metrics
        with open(os.path.join(output_dir, "best_metrics.json"), "w") as f:
            json.dump(best_metrics, f, indent=2)

    return best_metrics

# Main

## Params and Funcs

In [19]:
# Global Params
TEACHER_EXAMPLE_LEN = 374 # number of train mbpp problems
STUDENT_EXAMPLE_LEN = 50
GENERATED_TOKEN_LEN = 512

# Training Params
NUM_EPOCHS = 6
LEARNING_RATE = 2e-5
BATCH_SIZE = 10
WARMUP_STEPS = TEACHER_EXAMPLE_LEN * 0.05

In [20]:
clear_gpu_memory()

print("Loading MBPP dataset...")
mbpp_train_examples, mbpp_test_examples = load_mbpp_dataset()

# print("Loading Instruct models...")
# teacher_model, tokenizer, student_model, student_tokenizer = load_models("Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct")

# print("Loading Coder models...")
# code_teacher_model, code_tokenizer, code_student_model, code_student_tokenizer = load_models("Qwen/Qwen2.5-Coder-7B-Instruct", "Qwen/Qwen2.5-Coder-0.5B-Instruct")

# save student model initial states for efficient memory storage when fine tuning later
# student_initial_state = {k: v.detach().clone() for k, v in student_model.state_dict().items()}
# code_student_initial_state = {k: v.detach().clone() for k, v in code_student_model.state_dict().items()}

Loading MBPP dataset...
Loaded 374 train problems and 500 evaluation problems from MBPP dataset


In [21]:
def generate_base_model_examples(train_problems, test_problems, teacher_model, student_model, tokenizer):
    print(f"Generating {SOLUTION_FIELD} examples using Teacher model...")
    train_examples = generate_dataset(
        train_problems,
        PROMPT_TEMPLATE,
        SOLUTION_FIELD,
        OUTPUT_MARKER,
        teacher_model,
        tokenizer,
        num_examples=TEACHER_EXAMPLE_LEN,
        max_new_tokens=GENERATED_TOKEN_LEN,
        temperature=TEMPERATURE
    )

    print(f"Generating {SOLUTION_FIELD} examples using untrained Student model...")
    untrained_examples = generate_dataset(
        test_problems,
        PROMPT_TEMPLATE,
        SOLUTION_FIELD,
        OUTPUT_MARKER,
        student_model,
        tokenizer,
        num_examples=STUDENT_EXAMPLE_LEN,
        max_new_tokens=GENERATED_TOKEN_LEN,
        temperature=TEMPERATURE,
        teacher=False
    )

    return train_examples, untrained_examples

In [22]:
def start_fine_tuning(student_model, tokenizer, train_examples):
  print(f"Fine-Tuning {SOLUTION_FIELD} on Student model...")
  trained_student_model = fine_tune_student_model(
      student_model=student_model,
      student_tokenizer=tokenizer,
      train_data=train_examples,
      prompt=PROMPT_TEMPLATE,
      output_field=SOLUTION_FIELD,
      batch_size=BATCH_SIZE,
      num_epochs=NUM_EPOCHS,
      learning_rate=LEARNING_RATE,
      warmup_steps=WARMUP_STEPS,
      max_length=GENERATED_TOKEN_LEN
  )

  return trained_student_model

In [23]:
def generate_trained_model_examples(test_problems, trained_student_model, tokenizer):
    print(f"Generating {SOLUTION_FIELD} examples using Trained Student model...")
    trained_examples = generate_dataset(
        test_problems,
        PROMPT_TEMPLATE,
        SOLUTION_FIELD,
        OUTPUT_MARKER,
        trained_student_model,
        tokenizer,
        num_examples=STUDENT_EXAMPLE_LEN+1, # add 1 to not overwrite the untrained student data file
        max_new_tokens=GENERATED_TOKEN_LEN,
        temperature=TEMPERATURE,
        teacher=False
    )

    return trained_examples

## Restart and Run all above
In case of disk/memory filling, restart the kernel and run cells above here. Then load data generated so far from json.

## CoT Agent

In [24]:
clear_gpu_memory()
# teacher_model.to(device)
# code_teacher_model.cpu
# student_model.to(device)
# code_student_model.cpu

# CoT Agent Params
PROMPT_TEMPLATE = COT_PROMPT_TEMPLATE
SOLUTION_FIELD = "solution_cot"
OUTPUT_MARKER = "Step-by-step solution:"
GENERATED_TOKEN_LEN = 150
TEMPERATURE=0.6

print("Loading Instruct models...")
teacher_model, tokenizer, student_model, student_tokenizer = load_models("Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct")
student_initial_state = {k: v.detach().clone() for k, v in student_model.state_dict().items()}

2025-04-19 19:47:43,088 - INFO - Loading teacher model: Qwen/Qwen2.5-7B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading Instruct models...


2025-04-19 19:47:44,833 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-04-19 19:47:53,626 - INFO - Teacher model loaded successfully
2025-04-19 19:47:53,627 - INFO - Loading student model: Qwen/Qwen2.5-0.5B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-19 19:47:53,880 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-04-19 19:47:54,691 - INFO - Student model loaded successfully


In [25]:
clear_gpu_memory()

student_model.load_state_dict(student_initial_state)
train_cot_examples, untrained_cot_examples = generate_base_model_examples(
    mbpp_train_examples,
    mbpp_test_examples,
    teacher_model,
    student_model,
    tokenizer
)

2025-04-19 19:47:57,316 - INFO - Generating solution_cot with teacher for 374 problems...


Generating solution_cot examples using Teacher model...


Generating solution_cot:   0%|          | 1/374 [00:04<25:22,  4.08s/it]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): Greedy

1. Input/output: Given a list of n pairs of integers [(a1, b1), (a2, b2), ..., (an, bn)], return an integer representing the length of the lon...


Generating solution_cot:   1%|          | 2/374 [00:07<23:25,  3.78s/it]


Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): Hash Table

1. Input/output: Given a string; return the first repeated character as a string
2. Approach: Use a hash table to track character occurren...


Generating solution_cot:   2%|▏         | 9/374 [00:32<21:25,  3.52s/it]2025-04-19 19:48:33,198 - INFO - Generated 10/374 solutions
Generating solution_cot:   5%|▌         | 19/374 [01:07<21:04,  3.56s/it]2025-04-19 19:49:08,768 - INFO - Generated 20/374 solutions
Generating solution_cot:   8%|▊         | 29/374 [01:43<20:30,  3.57s/it]2025-04-19 19:49:44,415 - INFO - Generated 30/374 solutions
Generating solution_cot:  10%|█         | 39/374 [02:19<19:56,  3.57s/it]2025-04-19 19:50:20,058 - INFO - Generated 40/374 solutions
Generating solution_cot:  13%|█▎        | 49/374 [02:53<19:05,  3.53s/it]2025-04-19 19:50:54,711 - INFO - Generated 50/374 solutions
Generating solution_cot:  16%|█▌        | 59/374 [03:29<18:41,  3.56s/it]2025-04-19 19:51:30,326 - INFO - Generated 60/374 solutions
Generating solution_cot:  18%|█▊        | 69/374 [04:05<18:06,  3.56s/it]2025-04-19 19:52:05,937 - INFO - Generated 70/374 solutions
Generating solution_cot:  21%|██        | 79/374 [04:40<17:31,  3.56s/

Generating solution_cot examples using untrained Student model...


Generating solution_cot:   2%|▏         | 1/50 [00:02<02:11,  2.68s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): 1. Define an empty list named 'chars' to store characters that need to be removed.
2. Use a while loop to iterate through each character in the input ...


Generating solution_cot:   4%|▍         | 2/50 [00:05<02:08,  2.67s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): 1. Calculate the sum of each row.
2. Sort these sums in ascending order.
3. Construct the sorted matrix by placing each element at its corresponding p...


Generating solution_cot:  18%|█▊        | 9/50 [00:24<01:50,  2.69s/it]2025-04-19 20:10:24,699 - INFO - Generated 10/50 solutions
Generating solution_cot:  38%|███▊      | 19/50 [00:51<01:23,  2.68s/it]2025-04-19 20:10:51,524 - INFO - Generated 20/50 solutions
Generating solution_cot:  58%|█████▊    | 29/50 [01:17<00:56,  2.69s/it]2025-04-19 20:11:18,440 - INFO - Generated 30/50 solutions
Generating solution_cot:  78%|███████▊  | 39/50 [01:44<00:29,  2.68s/it]2025-04-19 20:11:45,252 - INFO - Generated 40/50 solutions
Generating solution_cot:  98%|█████████▊| 49/50 [02:11<00:02,  2.68s/it]2025-04-19 20:12:12,054 - INFO - Generated 50/50 solutions
Generating solution_cot: 100%|██████████| 50/50 [02:14<00:00,  2.69s/it]
2025-04-19 20:12:12,055 - INFO - Successfully generated 50 solution_cot solutions
2025-04-19 20:12:12,061 - INFO - Dataset saved to dataset/solution_cot_student_50_dataset.json


In [26]:
# # in case of disk/memory filling, this reloads the examples from json

# clear_gpu_memory()

# mdpp_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_cot_examples = json.load(mdpp_examples_file)

# print(train_cot_examples[0])

In [27]:
clear_gpu_memory()

# Fine-tune the student model
student_model.load_state_dict(student_initial_state)
trained_cot_student_model = start_fine_tuning(student_model, tokenizer, train_cot_examples)

2025-04-19 20:12:12,223 - INFO - Starting training the student model for 6 epochs


Fine-Tuning solution_cot on Student model...


Epoch 1/6: 100%|██████████| 38/38 [00:13<00:00,  2.84it/s, loss=0.761]
2025-04-19 20:12:25,593 - INFO - Epoch 1/6 - Average loss: 0.8007
Epoch 2/6: 100%|██████████| 38/38 [00:13<00:00,  2.85it/s, loss=0.34] 
2025-04-19 20:12:38,917 - INFO - Epoch 2/6 - Average loss: 0.4000
Epoch 3/6:  61%|██████    | 23/38 [00:08<00:05,  2.82it/s, loss=0.302]2025-04-19 20:12:47,436 - INFO - Step 100: loss = 0.3015
Epoch 3/6: 100%|██████████| 38/38 [00:13<00:00,  2.85it/s, loss=0.116]
2025-04-19 20:12:52,234 - INFO - Epoch 3/6 - Average loss: 0.2299
Epoch 4/6: 100%|██████████| 38/38 [00:13<00:00,  2.85it/s, loss=0.132] 
2025-04-19 20:13:05,551 - INFO - Epoch 4/6 - Average loss: 0.1140
Epoch 5/6: 100%|██████████| 38/38 [00:13<00:00,  2.85it/s, loss=0.0257]
2025-04-19 20:13:18,864 - INFO - Epoch 5/6 - Average loss: 0.0462
Epoch 6/6:  24%|██▎       | 9/38 [00:03<00:10,  2.82it/s, loss=0.041] 2025-04-19 20:13:22,414 - INFO - Step 200: loss = 0.0410
Epoch 6/6: 100%|██████████| 38/38 [00:13<00:00,  2.85it/s, 

In [28]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [29]:
# Generate fine-tuned student model outputs
trained_cot_examples = generate_trained_model_examples(mbpp_test_examples, trained_cot_student_model, tokenizer)

# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

del teacher_model, tokenizer
del student_model, student_tokenizer
torch.cuda.empty_cache()

2025-04-19 20:13:36,892 - INFO - Generating solution_cot with student for 51 problems...


Generating solution_cot examples using Trained Student model...


Generating solution_cot:   2%|▏         | 1/51 [00:02<02:16,  2.72s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): Rabin-Karp String Manipulation

1. Input/output: A string and a character; returns modified string without first and last occurrences
2. Approach: Mod...


Generating solution_cot:   4%|▍         | 2/51 [00:05<02:13,  2.72s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): Greedy Algorithm

1. Input/output: Given a matrix, return the sorted matrix.
2. Approach: Use the Greedy Algorithm to calculate the sum of elements in...


Generating solution_cot:  18%|█▊        | 9/51 [00:24<01:53,  2.70s/it]2025-04-19 20:14:03,933 - INFO - Generated 10/51 solutions
Generating solution_cot:  37%|███▋      | 19/51 [00:51<01:26,  2.70s/it]2025-04-19 20:14:30,944 - INFO - Generated 20/51 solutions
Generating solution_cot:  57%|█████▋    | 29/51 [01:18<00:59,  2.70s/it]2025-04-19 20:14:57,990 - INFO - Generated 30/51 solutions
Generating solution_cot:  76%|███████▋  | 39/51 [01:45<00:32,  2.71s/it]2025-04-19 20:15:25,070 - INFO - Generated 40/51 solutions
Generating solution_cot:  96%|█████████▌| 49/51 [02:12<00:05,  2.71s/it]2025-04-19 20:15:52,112 - INFO - Generated 50/51 solutions
Generating solution_cot: 100%|██████████| 51/51 [02:17<00:00,  2.70s/it]
2025-04-19 20:15:54,832 - INFO - Successfully generated 51 solution_cot solutions
2025-04-19 20:15:54,837 - INFO - Dataset saved to dataset/solution_cot_student_51_dataset.json


## Coder Agent

In [25]:
clear_gpu_memory()

# CoT Agent Params
PROMPT_TEMPLATE = CODER_PROMPT_TEMPLATE
SOLUTION_FIELD = "code"
OUTPUT_MARKER = "Python code:"
GENERATED_TOKEN_LEN = 512
TEMPERATURE=0.7

print("Loading Coder models...")
code_teacher_model, tokenizer, code_student_model, code_student_tokenizer = load_models("Qwen/Qwen2.5-Coder-7B-Instruct", "Qwen/Qwen2.5-Coder-0.5B-Instruct")
code_student_initial_state = {k: v.detach().clone() for k, v in code_student_model.state_dict().items()}

2025-04-19 20:46:55,168 - INFO - Loading teacher model: Qwen/Qwen2.5-Coder-7B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading Coder models...


2025-04-19 20:46:56,863 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-04-19 20:47:06,777 - INFO - Teacher model loaded successfully
2025-04-19 20:47:06,778 - INFO - Loading student model: Qwen/Qwen2.5-Coder-0.5B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-19 20:47:07,033 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-04-19 20:47:07,848 - INFO - Student model loaded successfully


In [31]:
# train_cot_examples_file = open(f"dataset/solution_cot_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_cot_examples = json.load(train_cot_examples_file)

# print(train_cot_examples[0])

# trained_cot_examples_file = open(f"dataset/solution_cot_student_{STUDENT_EXAMPLE_LEN+1}_dataset.json")
# trained_cot_examples = json.load(trained_cot_examples_file)

# print(trained_cot_examples[0])

In [32]:
clear_gpu_memory()

code_student_model.load_state_dict(code_student_initial_state)
train_code_examples, untrained_code_examples = generate_base_model_examples(
    train_cot_examples,
    trained_cot_examples,
    code_teacher_model,
    code_student_model,
    tokenizer
)

2025-04-19 20:22:12,289 - INFO - Generating code with teacher for 374 problems...


Generating code examples using Teacher model...


Generating code:   0%|          | 1/374 [00:03<23:57,  3.85s/it]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): ```python
def findLongestChain(pairs):
    if not pairs:
        return 0
    
    # Sort pairs by their second element
    pairs.sort(key=lambda x: x...


Generating code:   1%|          | 2/374 [00:05<14:44,  2.38s/it]


Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): ```python
def first_repeated_char(s):
    char_dict = {}
    for char in s:
        if char in char_dict:
            return char
        else:
      ...


Generating code:   2%|▏         | 9/374 [00:21<19:43,  3.24s/it]2025-04-19 20:22:35,161 - INFO - Generated 10/374 solutions
Generating code:   5%|▌         | 19/374 [00:55<21:35,  3.65s/it]2025-04-19 20:23:10,569 - INFO - Generated 20/374 solutions
Generating code:   8%|▊         | 29/374 [01:28<16:37,  2.89s/it]2025-04-19 20:23:43,913 - INFO - Generated 30/374 solutions
Generating code:  10%|█         | 39/374 [02:01<17:28,  3.13s/it]2025-04-19 20:24:15,999 - INFO - Generated 40/374 solutions
Generating code:  13%|█▎        | 49/374 [02:36<19:53,  3.67s/it]2025-04-19 20:24:49,998 - INFO - Generated 50/374 solutions
Generating code:  16%|█▌        | 59/374 [03:00<10:12,  1.94s/it]2025-04-19 20:25:13,183 - INFO - Generated 60/374 solutions
Generating code:  18%|█▊        | 69/374 [03:23<12:13,  2.40s/it]2025-04-19 20:25:36,995 - INFO - Generated 70/374 solutions
Generating code:  21%|██        | 79/374 [03:42<11:11,  2.28s/it]2025-04-19 20:25:56,861 - INFO - Generated 80/374 solutions
G

Generating code examples using untrained Student model...


Generating code:   2%|▏         | 1/50 [00:06<04:59,  6.11s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ```python
def remove_first_and_last_occurrence(s, char):
    # Calculate prefix sums array
    n = len(s)
    prefix_sums = [0] * (n + 1)
    
    # C...


Generating code:   4%|▍         | 2/50 [00:10<04:11,  5.24s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): ```python
def sort_matrix_by_row_sum(matrix):
    """
    Sorts a given matrix in ascending order based on the sum of its rows.

    :param matrix: A ...


Generating code:  18%|█▊        | 9/50 [00:49<03:32,  5.18s/it]2025-04-19 20:41:26,352 - INFO - Generated 10/50 solutions
Generating code:  38%|███▊      | 19/50 [01:40<02:53,  5.59s/it]2025-04-19 20:42:18,630 - INFO - Generated 20/50 solutions
Generating code:  58%|█████▊    | 29/50 [02:27<01:49,  5.23s/it]2025-04-19 20:43:06,884 - INFO - Generated 30/50 solutions
Generating code:  78%|███████▊  | 39/50 [03:18<00:51,  4.66s/it]2025-04-19 20:43:57,804 - INFO - Generated 40/50 solutions
Generating code:  98%|█████████▊| 49/50 [04:04<00:03,  3.70s/it]2025-04-19 20:44:41,269 - INFO - Generated 50/50 solutions
Generating code: 100%|██████████| 50/50 [04:07<00:00,  4.96s/it]
2025-04-19 20:44:41,270 - INFO - Successfully generated 50 code solutions
2025-04-19 20:44:41,277 - INFO - Dataset saved to dataset/code_student_50_dataset.json


In [30]:
# in case of disk/memory filling, this reloads the examples from json

clear_gpu_memory()

train_code_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
train_code_examples = json.load(train_code_examples_file)

print(train_code_examples[0])

trained_cot_examples_file = open(f"dataset/solution_cot_student_{STUDENT_EXAMPLE_LEN+1}_dataset.json")
trained_cot_examples = json.load(trained_cot_examples_file)

print(trained_cot_examples[0])

{'problem': 'Write a function to find the longest chain which can be formed from the given set of pairs.', 'test_case': ['assert max_chain_length([Pair(5, 24), Pair(15, 25),Pair(27, 40), Pair(50, 60)], 4) == 3', 'assert max_chain_length([Pair(1, 2), Pair(3, 4),Pair(5, 6), Pair(7, 8)], 4) == 4', 'assert max_chain_length([Pair(19, 10), Pair(11, 12),Pair(13, 14), Pair(15, 16), Pair(31, 54)], 5) == 5'], 'solution_code': 'class Pair(object): \r\n\tdef __init__(self, a, b): \r\n\t\tself.a = a \r\n\t\tself.b = b \r\ndef max_chain_length(arr, n): \r\n\tmax = 0\r\n\tmcl = [1 for i in range(n)] \r\n\tfor i in range(1, n): \r\n\t\tfor j in range(0, i): \r\n\t\t\tif (arr[i].a > arr[j].b and\r\n\t\t\t\tmcl[i] < mcl[j] + 1): \r\n\t\t\t\tmcl[i] = mcl[j] + 1\r\n\tfor i in range(n): \r\n\t\tif (max < mcl[i]): \r\n\t\t\tmax = mcl[i] \r\n\treturn max', 'solution_cot': "Greedy\n\n1. Input/output: Given a list of n pairs of integers [(a1, b1), (a2, b2), ..., (an, bn)], return an integer representing the le

In [27]:
clear_gpu_memory()

# Fine-tune the student model
code_student_model.load_state_dict(code_student_initial_state)
trained_code_student_model = start_fine_tuning(code_student_model, tokenizer, train_code_examples)

2025-04-19 20:47:08,039 - INFO - Starting training the student model for 6 epochs


Fine-Tuning code on Student model...


Epoch 1/6: 100%|██████████| 38/38 [00:41<00:00,  1.09s/it, loss=0.0831]
2025-04-19 20:47:49,367 - INFO - Epoch 1/6 - Average loss: 0.5465
Epoch 2/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.049] 
2025-04-19 20:48:30,133 - INFO - Epoch 2/6 - Average loss: 0.0738
Epoch 3/6:  61%|██████    | 23/38 [00:26<00:16,  1.09s/it, loss=0.0494]2025-04-19 20:48:56,258 - INFO - Step 100: loss = 0.0494
Epoch 3/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.0632]
2025-04-19 20:49:10,888 - INFO - Epoch 3/6 - Average loss: 0.0424
Epoch 4/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.0109]
2025-04-19 20:49:51,639 - INFO - Epoch 4/6 - Average loss: 0.0185
Epoch 5/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.00618]
2025-04-19 20:50:32,381 - INFO - Epoch 5/6 - Average loss: 0.0081
Epoch 6/6:  24%|██▎       | 9/38 [00:10<00:31,  1.09s/it, loss=0.00406]2025-04-19 20:50:43,265 - INFO - Step 200: loss = 0.0041
Epoch 6/6: 100%|██████████| 38/38 [00:40<00:00,  1.07

In [28]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [31]:
# Generate Code examples using Trained Student model
trained_code_examples = generate_trained_model_examples(trained_cot_examples, trained_code_student_model, tokenizer)

# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

del code_teacher_model, tokenizer
del code_student_model, code_student_tokenizer
torch.cuda.empty_cache()  # Clear CUDA cache

2025-04-19 20:52:18,501 - INFO - Generating code with student for 51 problems...


Generating code examples using Trained Student model...


Generating code:   2%|▏         | 1/51 [00:02<02:13,  2.68s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ```python
def remove_char(s, c):
    if not s or c not in s:
        return s
    
    n = len(s)
    prefix_sum = [0] * (n + 1)
    
    for i in ran...


Generating code:   4%|▍         | 2/51 [00:04<01:58,  2.41s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): ```python
def sort_matrix_by_row_sum(matrix):
    if not matrix:
        return []
    
    n = len(matrix)
    result = [[0] * n for _ in range(n)]
 ...


Generating code:  18%|█▊        | 9/51 [00:18<01:03,  1.51s/it]2025-04-19 20:52:37,332 - INFO - Generated 10/51 solutions
Generating code:  37%|███▋      | 19/51 [00:33<00:57,  1.79s/it]2025-04-19 20:52:55,988 - INFO - Generated 20/51 solutions
Generating code:  57%|█████▋    | 29/51 [01:01<00:57,  2.61s/it]2025-04-19 20:53:20,990 - INFO - Generated 30/51 solutions
Generating code:  76%|███████▋  | 39/51 [01:11<00:11,  1.06it/s]2025-04-19 20:53:35,685 - INFO - Generated 40/51 solutions
Generating code:  96%|█████████▌| 49/51 [01:29<00:02,  1.02s/it]2025-04-19 20:53:50,449 - INFO - Generated 50/51 solutions
Generating code: 100%|██████████| 51/51 [01:33<00:00,  1.83s/it]
2025-04-19 20:53:52,029 - INFO - Successfully generated 51 code solutions
2025-04-19 20:53:52,034 - INFO - Dataset saved to dataset/code_student_51_dataset.json


## Debugger Agent

In [32]:
clear_gpu_memory()

# CoT Agent Params
PROMPT_TEMPLATE = DEBUGGER_PROMPT_TEMPLATE
SOLUTION_FIELD = "debugged"
OUTPUT_MARKER = "Debugged Python code:"
GENERATED_TOKEN_LEN = 512
TEMPERATURE=0.7

print("Loading Coder models...")
code_teacher_model, tokenizer, code_student_model, code_student_tokenizer = load_models("Qwen/Qwen2.5-Coder-7B-Instruct", "Qwen/Qwen2.5-Coder-0.5B-Instruct")
code_student_initial_state = {k: v.detach().clone() for k, v in code_student_model.state_dict().items()}

2025-04-19 20:54:37,144 - INFO - Loading teacher model: Qwen/Qwen2.5-Coder-7B-Instruct


Loading Coder models...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-19 20:54:37,549 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-04-19 20:54:48,230 - INFO - Teacher model loaded successfully
2025-04-19 20:54:48,231 - INFO - Loading student model: Qwen/Qwen2.5-Coder-0.5B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-19 20:54:48,538 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-04-19 20:54:49,249 - INFO - Student model loaded successfully


In [33]:
clear_gpu_memory()

code_student_model.load_state_dict(code_student_initial_state)
train_debug_examples, untrained_debug_examples = generate_base_model_examples(
    train_code_examples,
    trained_code_examples,
    code_teacher_model,
    code_student_model,
    tokenizer
)

2025-04-19 20:54:49,361 - INFO - Generating debugged with teacher for 374 problems...


Generating debugged examples using Teacher model...


Generating debugged:   0%|          | 1/374 [00:02<13:55,  2.24s/it]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): ```python
def findLongestChain(pairs):
    if not pairs:
        return 0
    
    # Sort pairs by their first element
    pairs.sort(key=lambda x: x[...


Generating debugged:   1%|          | 2/374 [00:03<09:58,  1.61s/it]


Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): ```python
def first_repeated_char(s):
    char_dict = set()
    for char in s:
        if char in char_dict:
            return char
        char_dict...


Generating debugged:   2%|▏         | 9/374 [00:19<16:24,  2.70s/it]2025-04-19 20:55:09,542 - INFO - Generated 10/374 solutions
Generating debugged:   5%|▌         | 19/374 [00:44<15:55,  2.69s/it]2025-04-19 20:55:37,416 - INFO - Generated 20/374 solutions
Generating debugged:   8%|▊         | 29/374 [01:11<13:56,  2.42s/it]2025-04-19 20:56:03,170 - INFO - Generated 30/374 solutions
Generating debugged:  10%|█         | 39/374 [01:37<16:03,  2.88s/it]2025-04-19 20:56:28,020 - INFO - Generated 40/374 solutions
Generating debugged:  13%|█▎        | 49/374 [02:05<16:43,  3.09s/it]2025-04-19 20:56:55,872 - INFO - Generated 50/374 solutions
Generating debugged:  16%|█▌        | 59/374 [02:28<10:42,  2.04s/it]2025-04-19 20:57:19,444 - INFO - Generated 60/374 solutions
Generating debugged:  18%|█▊        | 69/374 [02:52<11:54,  2.34s/it]2025-04-19 20:57:42,301 - INFO - Generated 70/374 solutions
Generating debugged:  21%|██        | 79/374 [03:05<07:10,  1.46s/it]2025-04-19 20:57:56,587 - INF

Generating debugged examples using untrained Student model...


Generating debugged:   2%|▏         | 1/50 [00:05<04:52,  5.97s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ```python
def remove_char(s, c):
    if not s or c not in s:
        return s
    
    n = len(s)
    prefix_sum = [0] * (n + 1)
    
    for i in ran...


Generating debugged:   4%|▍         | 2/50 [00:10<04:02,  5.05s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): ```python
def sort_matrix_by_row_sum(matrix):
    if not matrix:
        return []
    
    n = len(matrix)
    result = [[0] * n for _ in range(n)]
 ...


Generating debugged:  18%|█▊        | 9/50 [00:51<03:21,  4.92s/it]2025-04-19 21:10:59,494 - INFO - Generated 10/50 solutions
Generating debugged:  38%|███▊      | 19/50 [01:55<02:58,  5.77s/it]2025-04-19 21:11:59,278 - INFO - Generated 20/50 solutions
Generating debugged:  58%|█████▊    | 29/50 [03:00<02:16,  6.50s/it]2025-04-19 21:13:03,809 - INFO - Generated 30/50 solutions
Generating debugged:  78%|███████▊  | 39/50 [03:58<00:56,  5.15s/it]2025-04-19 21:14:03,557 - INFO - Generated 40/50 solutions
Generating debugged:  98%|█████████▊| 49/50 [04:58<00:05,  5.16s/it]2025-04-19 21:15:01,337 - INFO - Generated 50/50 solutions
Generating debugged: 100%|██████████| 50/50 [05:02<00:00,  6.05s/it]
2025-04-19 21:15:01,338 - INFO - Successfully generated 50 debugged solutions
2025-04-19 21:15:01,344 - INFO - Dataset saved to dataset/debugged_student_50_dataset.json


In [34]:
# # in case of disk/memory filling, this reloads the examples from json

# clear_gpu_memory()

# train_debug_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_debug_examples = json.load(train_debug_examples_file)

# print(train_debug_examples[0])

# trained_code_examples_file = open(f"dataset/code_student_{STUDENT_EXAMPLE_LEN+1}_dataset.json")
# trained_code_examples = json.load(trained_code_examples_file)

# print(trained_code_examples[0])

In [35]:
clear_gpu_memory()

# Fine-tune the student model
code_student_model.load_state_dict(code_student_initial_state)
trained_debug_student_model = start_fine_tuning(code_student_model, tokenizer, train_debug_examples)

2025-04-19 21:15:01,480 - INFO - Starting training the student model for 6 epochs


Fine-Tuning debugged on Student model...


Epoch 1/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.0512]
2025-04-19 21:15:42,229 - INFO - Epoch 1/6 - Average loss: 0.4451
Epoch 2/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.0163] 
2025-04-19 21:16:22,945 - INFO - Epoch 2/6 - Average loss: 0.0184
Epoch 3/6:  61%|██████    | 23/38 [00:26<00:16,  1.09s/it, loss=0.00528]2025-04-19 21:16:49,037 - INFO - Step 100: loss = 0.0053
Epoch 3/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.00666]
2025-04-19 21:17:03,656 - INFO - Epoch 3/6 - Average loss: 0.0112
Epoch 4/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.000419]
2025-04-19 21:17:44,359 - INFO - Epoch 4/6 - Average loss: 0.0051
Epoch 5/6: 100%|██████████| 38/38 [00:40<00:00,  1.07s/it, loss=0.00175] 
2025-04-19 21:18:25,059 - INFO - Epoch 5/6 - Average loss: 0.0025
Epoch 6/6:  24%|██▎       | 9/38 [00:10<00:31,  1.09s/it, loss=0.000824]2025-04-19 21:18:35,929 - INFO - Step 200: loss = 0.0008
Epoch 6/6: 100%|██████████| 38/38 [00:40<00:00

In [36]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [37]:
# Generate debug examples using Trained Student model...
trained_debug_examples = generate_trained_model_examples(trained_code_examples, trained_debug_student_model, tokenizer)

# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

del code_teacher_model, tokenizer
del code_student_model, code_student_tokenizer
torch.cuda.empty_cache()  # Clear CUDA cache

2025-04-19 21:19:10,216 - INFO - Generating debugged with student for 51 problems...


Generating debugged examples using Trained Student model...


Generating debugged:   2%|▏         | 1/51 [00:02<02:12,  2.65s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): ```python
def remove_char(s, c):
    if not s or c not in s:
        return s
    
    n = len(s)
    prefix_sum = [0] * (n + 1)
    
    for i in ran...


Generating debugged:   4%|▍         | 2/51 [00:04<01:38,  2.01s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): ```python
def sort_matrix_by_row_sum(matrix):
    if not matrix:
        return []
    
    n = len(matrix)
    result = [[0] * n for _ in range(n)]
 ...


Generating debugged:  18%|█▊        | 9/51 [00:17<01:03,  1.51s/it]2025-04-19 21:19:28,461 - INFO - Generated 10/51 solutions
Generating debugged:  37%|███▋      | 19/51 [00:30<00:50,  1.57s/it]2025-04-19 21:19:44,497 - INFO - Generated 20/51 solutions
Generating debugged:  57%|█████▋    | 29/51 [00:56<00:53,  2.44s/it]2025-04-19 21:20:07,553 - INFO - Generated 30/51 solutions
Generating debugged:  76%|███████▋  | 39/51 [01:05<00:11,  1.08it/s]2025-04-19 21:20:21,553 - INFO - Generated 40/51 solutions
Generating debugged:  96%|█████████▌| 49/51 [01:22<00:01,  1.03it/s]2025-04-19 21:20:35,149 - INFO - Generated 50/51 solutions
Generating debugged: 100%|██████████| 51/51 [01:26<00:00,  1.70s/it]
2025-04-19 21:20:36,749 - INFO - Successfully generated 51 debugged solutions
2025-04-19 21:20:36,755 - INFO - Dataset saved to dataset/debugged_student_51_dataset.json


## Explainer Agent

In [24]:
clear_gpu_memory()

# CoT Agent Params
PROMPT_TEMPLATE = EXPLAINER_PROMPT_TEMPLATE
SOLUTION_FIELD = "explanation"
OUTPUT_MARKER = "Python code explanation:"
GENERATED_TOKEN_LEN = 512
TEMPERATURE=0.7

print("Loading Instruct models...")
teacher_model, tokenizer, student_model, student_tokenizer = load_models("Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct")
student_initial_state = {k: v.detach().clone() for k, v in student_model.state_dict().items()}

2025-04-20 09:46:39,963 - INFO - Loading teacher model: Qwen/Qwen2.5-7B-Instruct


Loading Instruct models...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-20 09:46:43,243 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-04-20 09:46:57,162 - INFO - Teacher model loaded successfully
2025-04-20 09:46:57,162 - INFO - Loading student model: Qwen/Qwen2.5-0.5B-Instruct
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-04-20 09:46:57,424 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-04-20 09:46:58,420 - INFO - Student model loaded successfully


In [39]:
clear_gpu_memory()

#TODO: concat debug code examples to the train code examples(?)
student_model.load_state_dict(student_initial_state)
train_explain_examples, untrained_explain_examples = generate_base_model_examples(
    train_code_examples,
    trained_code_examples,
    teacher_model,
    student_model,
    tokenizer
)

2025-04-19 21:22:55,635 - INFO - Generating explanation with teacher for 374 problems...


Generating explanation examples using Teacher model...


Generating explanation:   0%|          | 1/374 [00:11<1:09:06, 11.12s/it]


Example 1:
Problem: Write a function to find the longest chain which can be formed from the given set of pairs....
Solution (first 150 chars): This Python function solves the problem of finding the longest chain that can be formed from a given set of pairs. It uses a greedy algorithm approach...


Generating explanation:   1%|          | 2/374 [00:22<1:10:13, 11.33s/it]


Example 2:
Problem: Write a python function to find the first repeated character in a given string....
Solution (first 150 chars): The code defines a Python function called `first_repeated_char` which takes a string `s` as input. It aims to find and return the first character that...


Generating explanation:   2%|▏         | 9/374 [01:37<1:06:29, 10.93s/it]2025-04-19 21:24:45,641 - INFO - Generated 10/374 solutions
Generating explanation:   5%|▌         | 19/374 [03:27<1:08:54, 11.65s/it]2025-04-19 21:26:35,444 - INFO - Generated 20/374 solutions
Generating explanation:   8%|▊         | 29/374 [05:04<53:43,  9.34s/it]  2025-04-19 21:28:07,961 - INFO - Generated 30/374 solutions
Generating explanation:  10%|█         | 39/374 [06:44<59:21, 10.63s/it]  2025-04-19 21:29:52,675 - INFO - Generated 40/374 solutions
Generating explanation:  13%|█▎        | 49/374 [08:28<49:57,  9.22s/it]  2025-04-19 21:31:35,410 - INFO - Generated 50/374 solutions
Generating explanation:  16%|█▌        | 59/374 [09:59<50:36,  9.64s/it]2025-04-19 21:33:05,212 - INFO - Generated 60/374 solutions
Generating explanation:  18%|█▊        | 69/374 [11:36<49:27,  9.73s/it]2025-04-19 21:34:39,757 - INFO - Generated 70/374 solutions
Generating explanation:  21%|██        | 79/374 [13:11<45:16,  9.21

Generating explanation examples using untrained Student model...


Generating explanation:   2%|▏         | 1/50 [00:04<03:58,  4.87s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): The Python function `remove_char` takes two arguments: `s`, which is the input string, and `c`, which is the character to be removed. It returns a new...


Generating explanation:   4%|▍         | 2/50 [00:12<05:10,  6.47s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): The provided Python function `sort_matrix_by_row_sum` is designed to sort an input list or 2D numpy array based on the total value of elements in each...


Generating explanation:  18%|█▊        | 9/50 [00:53<03:39,  5.36s/it]2025-04-19 22:26:41,125 - INFO - Generated 10/50 solutions
Generating explanation:  38%|███▊      | 19/50 [01:50<02:39,  5.13s/it]2025-04-19 22:27:39,666 - INFO - Generated 20/50 solutions
Generating explanation:  58%|█████▊    | 29/50 [02:43<01:51,  5.31s/it]2025-04-19 22:28:28,335 - INFO - Generated 30/50 solutions
Generating explanation:  78%|███████▊  | 39/50 [03:50<01:10,  6.39s/it]2025-04-19 22:29:35,137 - INFO - Generated 40/50 solutions
Generating explanation:  98%|█████████▊| 49/50 [04:48<00:06,  6.67s/it]2025-04-19 22:30:35,163 - INFO - Generated 50/50 solutions
Generating explanation: 100%|██████████| 50/50 [04:54<00:00,  5.89s/it]
2025-04-19 22:30:35,164 - INFO - Successfully generated 50 explanation solutions
2025-04-19 22:30:35,172 - INFO - Dataset saved to dataset/explanation_student_50_dataset.json


In [26]:
# # in case of disk/memory filling, this reloads the examples from json

# clear_gpu_memory()

# train_explain_examples_file = open(f"dataset/{SOLUTION_FIELD}_teacher_{TEACHER_EXAMPLE_LEN}_dataset.json")
# train_explain_examples = json.load(train_explain_examples_file)

# print(train_explain_examples[0])

# trained_code_examples_file = open(f"dataset/code_student_{STUDENT_EXAMPLE_LEN+1}_dataset.json")
# trained_code_examples = json.load(trained_code_examples_file)

# print(trained_code_examples[0])

{'problem': 'Write a function to find the longest chain which can be formed from the given set of pairs.', 'test_case': ['assert max_chain_length([Pair(5, 24), Pair(15, 25),Pair(27, 40), Pair(50, 60)], 4) == 3', 'assert max_chain_length([Pair(1, 2), Pair(3, 4),Pair(5, 6), Pair(7, 8)], 4) == 4', 'assert max_chain_length([Pair(19, 10), Pair(11, 12),Pair(13, 14), Pair(15, 16), Pair(31, 54)], 5) == 5'], 'solution_code': 'class Pair(object): \r\n\tdef __init__(self, a, b): \r\n\t\tself.a = a \r\n\t\tself.b = b \r\ndef max_chain_length(arr, n): \r\n\tmax = 0\r\n\tmcl = [1 for i in range(n)] \r\n\tfor i in range(1, n): \r\n\t\tfor j in range(0, i): \r\n\t\t\tif (arr[i].a > arr[j].b and\r\n\t\t\t\tmcl[i] < mcl[j] + 1): \r\n\t\t\t\tmcl[i] = mcl[j] + 1\r\n\tfor i in range(n): \r\n\t\tif (max < mcl[i]): \r\n\t\t\tmax = mcl[i] \r\n\treturn max', 'solution_cot': "Greedy\n\n1. Input/output: Given a list of n pairs of integers [(a1, b1), (a2, b2), ..., (an, bn)], return an integer representing the le

In [27]:
clear_gpu_memory()

# Fine-tune the student model
student_model.load_state_dict(student_initial_state)
trained_explain_student_model = start_fine_tuning(student_model, tokenizer, train_explain_examples)

2025-04-20 09:49:22,180 - INFO - Starting training the student model for 6 epochs


Fine-Tuning explanation on Student model...


Epoch 1/6: 100%|██████████| 38/38 [00:41<00:00,  1.10s/it, loss=1.04] 
2025-04-20 09:50:03,901 - INFO - Epoch 1/6 - Average loss: 1.1583
Epoch 2/6: 100%|██████████| 38/38 [00:41<00:00,  1.08s/it, loss=0.791]
2025-04-20 09:50:44,973 - INFO - Epoch 2/6 - Average loss: 0.6859
Epoch 3/6:  61%|██████    | 23/38 [00:26<00:16,  1.10s/it, loss=0.351]2025-04-20 09:51:11,280 - INFO - Step 100: loss = 0.3514
Epoch 3/6: 100%|██████████| 38/38 [00:41<00:00,  1.08s/it, loss=0.286]
2025-04-20 09:51:26,045 - INFO - Epoch 3/6 - Average loss: 0.3454
Epoch 4/6: 100%|██████████| 38/38 [00:41<00:00,  1.08s/it, loss=0.0932]
2025-04-20 09:52:07,062 - INFO - Epoch 4/6 - Average loss: 0.1563
Epoch 5/6: 100%|██████████| 38/38 [00:41<00:00,  1.08s/it, loss=0.0517]
2025-04-20 09:52:48,090 - INFO - Epoch 5/6 - Average loss: 0.0658
Epoch 6/6:  24%|██▎       | 9/38 [00:10<00:31,  1.09s/it, loss=0.0397]2025-04-20 09:52:59,034 - INFO - Step 200: loss = 0.0397
Epoch 6/6: 100%|██████████| 38/38 [00:40<00:00,  1.08s/it, 

In [28]:
# # in case of disk/memory filling, this reloads the trained model from files

# trained_student_path = f"results/student_model_{SOLUTION_FIELD}_final"
# trained_student_model = AutoModelForCausalLM.from_pretrained(trained_student_path).to(device)
# trained_tokenizer = AutoTokenizer.from_pretrained(trained_student_path)

In [45]:
# Generate explanation examples using Trained Student model
trained_explain_examples = generate_trained_model_examples(trained_code_examples, trained_explain_student_model, tokenizer)

# print("Evaluating CoT student model...")
# evaluation_results = evaluate_student_model(
#     student_model=student_model,
#     student_tokenizer=student_tokenizer,
#     test_problems=test_problems,
#     teacher_model=teacher_model,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATED_TOKEN_LEN,
#     output_dir="results/evaluations"
# )

del teacher_model, tokenizer
del student_model, student_tokenizer
torch.cuda.empty_cache()  # Clear CUDA cache

2025-04-20 10:06:40,801 - INFO - Generating explanation with student for 51 problems...


Generating explanation examples using Trained Student model...


Generating explanation:   2%|▏         | 1/51 [00:09<07:37,  9.14s/it]


Example 1:
Problem: Write a python function to remove first and last occurrence of a given character from the string....
Solution (first 150 chars): This Python function `remove_char` removes all occurrences of a specified character from a given string. It uses a prefix sum array to efficiently cal...


Generating explanation:   4%|▍         | 2/51 [00:18<07:24,  9.07s/it]


Example 2:
Problem: Write a function to sort a given matrix in ascending order according to the sum of its rows....
Solution (first 150 chars): This Python function sorts a given matrix (list of lists) based on the sum of its rows. Instead of sorting directly, which would involve comparing eve...


Generating explanation:  18%|█▊        | 9/51 [01:06<05:10,  7.39s/it]2025-04-20 10:07:55,052 - INFO - Generated 10/51 solutions
Generating explanation:  37%|███▋      | 19/51 [02:24<04:11,  7.87s/it]2025-04-20 10:09:09,804 - INFO - Generated 20/51 solutions
Generating explanation:  57%|█████▋    | 29/51 [03:34<02:41,  7.34s/it]2025-04-20 10:10:18,739 - INFO - Generated 30/51 solutions
Generating explanation:  76%|███████▋  | 39/51 [04:41<01:16,  6.38s/it]2025-04-20 10:11:27,776 - INFO - Generated 40/51 solutions
Generating explanation:  96%|█████████▌| 49/51 [05:48<00:14,  7.31s/it]2025-04-20 10:12:35,384 - INFO - Generated 50/51 solutions
Generating explanation: 100%|██████████| 51/51 [06:03<00:00,  7.13s/it]
2025-04-20 10:12:44,412 - INFO - Successfully generated 51 explanation solutions
2025-04-20 10:12:44,424 - INFO - Dataset saved to dataset/explanation_student_51_dataset.json


# Extras

In [None]:
def extract_problem_description(source_code):
    """
    Extracts the problem description from the first docstring in the source code,
    whether it's enclosed in triple double quotes or triple single quotes.
    """
    docstring_pattern = re.compile(r'("""|\'\'\')(.*?)(\1)', re.DOTALL)
    match = docstring_pattern.search(source_code)

    if match:
        description = match.group(2)
        # Clean up leading/trailing whitespace on each line
        cleaned_lines = [line.strip() for line in description.strip().splitlines() if line.strip()]
        return ' '.join(cleaned_lines)

    raise Exception(f"Error: Unable to extract problem description. Please check the format of the prompt:\n{source_code}")
    return None

def extract_code_header(source_code):
    """
    Extracts everything from the beginning of the source code up to
    the first occurrence of either triple single quotes or triple double quotes.
    """
    # Match from start of string to the first triple quotes (single or double)
    docstring_pattern = re.compile(r'^(.*?)(?="""|\'\'\')', re.DOTALL)
    match = docstring_pattern.search(source_code)

    if match:
        header = match.group(1)
        # Clean up leading/trailing whitespace on each line
        cleaned_lines = [line.strip() for line in header.strip().splitlines() if line.strip()]
        return ' '.join(cleaned_lines)
    raise Exception(f"Error: Unable to extract code header. Please check the format of the prompt:\n{source_code}")
    return None

def load_human_eval_dataset():
    human_eval = load_dataset("openai_humaneval")

    train_problems = []
    # Extract problems from the MBPP dataset with correct field names
    for item in human_eval["test"]:
        train_problems.append({
            "problem": extract_problem_description(item["prompt"]),
            "code_header": extract_code_header(item["prompt"]),
            "test_case": item["prompt"],
            "solution_code": item["prompt"] + item["canonical_solution"]
        })
    return train_problems

COT_PROMPT_TEMPLATE = """Generate a detailed step-by-step solution for this coding problem.
Break down your thought process clearly, explaining your reasoning while considering:
- What are the inputs and outputs of the function?
- What algorithm or data structure is most appropriate?
- Are there any edge cases to handle?
- What's the efficiency of your approach?

Be concise in your explanation.

Problem:
{problem}

Step-by-step solution:"""

# CODER_PROMPT_TEMPLATE = """Generate only a markdown code block that contains clean, efficient
# Python code for this coding problem based on the solution approach. The code block must start
# with ```python on its own line, then the code, and end with ``` on its own line.
# Focus on:
# - Implementing the key algorithmic insights
# - Handling edge cases identified in the solution
# - Maintaining readability and efficiency
# Do not include:
# - test cases
# - extra code explanation

# Step-by-step solution:
# {cot_solution}

# Python code:
# {code_header}"""

CODER_PROMPT_TEMPLATE = """Generate only a markdown code block that contains clean, efficient
Python code for this coding problem based on the solution approach. The code block must start
with ```python on its own line, then the code, and end with ``` on its own line. Do not include
test cases or code explanations.
Focus on:
- Implementing the key algorithmic insights
- Handling edge cases identified in the solution
- Maintaining readability and efficiency

Step-by-step solution:
{cot_solution}

Python code:
{code_header}"""


human_eval = load_human_eval_dataset()
print(human_eval[0])

print("loaded dataset")

trained_cot_student_path = f"results/student_model_cot_solution_final"
trained_cot_student_model = AutoModelForCausalLM.from_pretrained(trained_cot_student_path).to(device)
trained_cot_tokenizer = AutoTokenizer.from_pretrained(trained_cot_student_path)

untrained_coder_model_name = "Qwen/Qwen2.5-Coder-0.5B"
untrained_coder_tokenizer = AutoTokenizer.from_pretrained(untrained_coder_model_name)
untrained_coder_model = AutoModelForCausalLM.from_pretrained(
    untrained_coder_model_name,
    device_map="auto",
    torch_dtype=torch.float32
)

print("Loaded models")

trained_cot_examples = generate_dataset(
    human_eval,
    COT_PROMPT_TEMPLATE,
    "cot_solution",
    "Step-by-step solution:",
    trained_cot_student_model,
    trained_cot_tokenizer,
    num_examples=100,
    max_new_tokens=512,
    teacher=False
)

print("cot examples generated")

code_examples = generate_dataset(
    trained_cot_examples,
    CODER_PROMPT_TEMPLATE,
    "gen_code",
    "Python code:",
    untrained_coder_model,
    untrained_coder_tokenizer,
    num_examples=100,
    max_new_tokens=512,
    teacher=False
)

print("code generated")



In [None]:
i = 22

#print('\nproblem:')
#print(new_code_examples[i]['problem'])
#print('\ncot')
#print(new_code_examples[i]['cot_solution'])
#print('\ngenerated_code')
print(new_code_examples[i]['gen_code'])

In [None]:
def extract_before_def(source_code):
    """
    Extracts everything from the beginning of the source code up to
    but not including the first occurrence of the 'def' keyword.
    Preserves original formatting.
    """
    pattern = re.compile(r'^(.*?)(?=def)', re.DOTALL)
    match = pattern.search(source_code)

    if match:
        return match.group(1)
    raise Exception(f"Error: Unable to extract content before 'def'. No 'def' keyword found in:\n{source_code}")
    return None

def extract_until_code_block(source_code):
    """
    Extracts everything from the beginning of the string up to
    but not including the first occurrence of three backticks (```).
    Preserves original formatting.
    """
    pattern = re.compile(r'^(.*?)(?=```)', re.DOTALL)
    match = pattern.search(source_code)

    if match:
        return match.group(1)
    return 'BAD'

solutions = [item['solution_code'] for item in new_code_examples]
generated_codes = [item['gen_code'] for item in new_code_examples]
for i, generated_code in enumerate(generated_codes):
    generated_codes[i] = extract_before_def(solutions[i]) + extract_until_code_block(generated_codes[i])
print(generated_codes[0])

def remove_bad_strings(string_array):
    """
    Removes any strings containing 'BAD' from the given array.
    Also prints the indices of removed strings.

    Args:
        string_array: A list of strings to filter

    Returns:
        A new list with all strings containing 'BAD' removed
    """
    bad_indices = []
    clean_strings = []

    for i, s in enumerate(string_array):
        if 'BAD' in s:
            bad_indices.append(i)
        else:
            clean_strings.append(s)

    # Print the indices of bad strings
    if bad_indices:
        print(f"Found 'BAD' in strings at indices: {bad_indices}")
    else:
        print("No strings containing 'BAD' found.")

    return clean_strings, bad_indices

edited_codes, bad_indices = remove_bad_strings(generated_codes)
print(len(edited_codes))

In [None]:
human_eval['test']

In [None]:
%pip install evaluate

from evaluate import load

# Load evaluation metric
code_eval = load("code_eval")

import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

problems = human_eval
test = []
for i, item in enumerate(edited_codes):
    edited_codes[i] = [item]
pred = edited_codes
c = 0

for i, s in enumerate(human_eval[:100]):
    if i not in bad_indices:
        test.append(s)
        c = c+1
        print(c)

pass_at_k = code_eval.compute(
        predictions=pred,
        references=test,
        k=[1]
)
print(pass_at_k)
print(pass_at_k[0]['pass@1']*100)

In [None]:
# Generate CoT (Chain of Thought) dataset
cot_examples = generate_dataset(
    problem_dataset=mbpp_problems,
    task_prompt=COT_PROMPT_TEMPLATE,
    solution_field="solution_cot",
    output_marker="Step-by-step solution:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/cot_dataset.json"
)

# Generate code dataset from CoT
code_examples = generate_dataset(
    problem_dataset=cot_examples,  # Use the output from CoT as input
    task_prompt=DEVELOPER_PROMPT_TEMPLATE,
    solution_field="code",
    output_marker="Python code:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/code_dataset.json"
)

# Generate debugged code dataset
debugged_examples = generate_dataset(
    problem_dataset=code_examples,  # Use the code examples as input
    task_prompt=DEBUGGER_PROMPT_TEMPLATE,
    solution_field="debugged_code",
    output_marker="Debugged Python code:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/debugged_code_dataset.json"
)

# Generate code explanations
explanation_examples = generate_dataset(
    problem_dataset=code_examples,  # Use code examples that also have CoT
    task_prompt=EXPLAINER_PROMPT_TEMPLATE,
    solution_field="explanation",
    output_marker="Explanation of the code:",
    teacher_model=teacher_model,
    teacher_tokenizer=teacher_tokenizer,
    num_examples=50,
    output_file="datasets/explanation_dataset.json"
)

In [None]:
for i, example in enumerate(mbpp_problems):
  print(f"Problem number: {i}")
  print(f"Problem: {example['problem']}")
  print("Test cases:")
  print(example['test_case'])
  print("Code Solution:")
  print(example['solution'])