In [1]:
# Math Problem Solving with Transformers: From Pre-trained Models to Fine-tuning

import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re

# For transformer models
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM,
    GPT2Tokenizer, GPT2LMHeadModel,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)

# For evaluation
from sklearn.metrics import accuracy_score
from datasets import Dataset as HFDataset

# For fine-tuning
try:
    from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("PEFT library not available. Fine-tuning with LoRA will be skipped.")

# Set random seeds for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

def generate_math_problem(difficulty='easy'):
    """Generate a simple math problem based on difficulty level."""
    if difficulty == 'easy':
        a, b = random.randint(1, 10), random.randint(1, 10)
        operation = random.choice(['+', '-'])
    elif difficulty == 'medium':
        a, b = random.randint(5, 20), random.randint(5, 20)
        operation = random.choice(['+', '-', '*'])
    else:  # 'hard'
        a, b = random.randint(10, 30), random.randint(2, 10)
        operation = random.choice(['+', '-', '*', '/'])
        if operation == '/':  # Ensure clean division
            a = b * random.randint(1, 10)
    
    problem = f"{a} {operation} {b}"
    
    # Calculate solution
    if operation == '+':
        solution = a + b
        work = f"To solve {a} + {b}, I add {a} and {b} to get {solution}."
    elif operation == '-':
        solution = a - b
        work = f"To solve {a} - {b}, I subtract {b} from {a} to get {solution}."
    elif operation == '*':
        solution = a * b
        work = f"To solve {a} * {b}, I multiply {a} by {b} to get {solution}."
    elif operation == '/':
        solution = a // b  # Integer division
        work = f"To solve {a} / {b}, I divide {a} by {b} to get {solution}."
    
    return {
        'problem': problem,
        'solution': float(solution),  # Convert to float for consistency
        'work': work,
        'difficulty': difficulty,
        'type': 'numeric'
    }

def generate_word_problem(difficulty='easy'):
    """Generate a word problem based on difficulty."""
    templates = {
        'easy': [
            "John has {a} apples and gets {b} more. How many apples does John have now?",
            "Sarah has {a} dollars and spends {b} dollars. How much money does she have left?"
        ],
        'medium': [
            "A store sold {a} items on Monday and {b} items on Tuesday. How many items were sold in total?",
            "A train travels at {a} miles per hour for {b} hours. How far does it travel?"
        ],
        'hard': [
            "A store has a {a}% discount on all items. If an item originally costs ${b}, what is the final price?",
            "A car uses {a} gallons of gas to travel {b} miles. At this rate, how many gallons will it use to travel {c} miles?"
        ]
    }
    
    template = random.choice(templates[difficulty])
    
    if difficulty == 'easy':
        a = random.randint(5, 20)
        b = random.randint(1, 10)
        
        if "more" in template:
            solution = a + b
            work = f"To solve this problem, I need to add {a} and {b}, which gives me {solution}."
        elif "spends" in template:
            solution = a - b
            work = f"To solve this problem, I need to subtract {b} from {a}, which gives me {solution}."
        
        problem = template.format(a=a, b=b)
        
    elif difficulty == 'medium':
        a = random.randint(10, 30)
        b = random.randint(2, 10)
        
        if "total" in template:
            solution = a + b
            work = f"To solve this problem, I need to add {a} and {b}, which gives me {solution}."
        elif "train" in template:
            solution = a * b
            work = f"To solve this problem, I need to multiply the speed ({a} mph) by the time ({b} hours), which gives me {solution} miles."
            
        problem = template.format(a=a, b=b)
        
    else:  # 'hard'
        a = random.randint(10, 40)  # discount percentage
        b = random.randint(50, 200)  # original price
        c = random.randint(100, 500)  # distance for car problem
        
        if "discount" in template:
            discounted_price = b * (100 - a) / 100
            solution = discounted_price
            work = f"To solve this problem, I need to calculate the discounted price as original price * (100% - discount%). So ${b} * (100% - {a}%) = ${b} * {100-a}% = ${solution}."
        elif "car" in template:
            gallons_per_mile = a / b
            solution = gallons_per_mile * c
            work = f"To solve this problem, I need to find the rate in gallons per mile: {a} gallons / {b} miles = {a/b:.4f} gallons per mile. Then I multiply by {c} miles to get {solution} gallons."
            
        problem = template.format(a=a, b=b, c=c)
    
    return {
        'problem': problem,
        'solution': float(solution),  # Convert to float for decimal solutions
        'work': work,
        'difficulty': difficulty,
        'type': 'word'
    }

def generate_dataset(n_samples=1000):
    """Generate a dataset of math problems with an equal mix of types and difficulties."""
    data = []
    difficulties = ['easy', 'medium', 'hard']
    
    # Generate numeric problems
    for i in range(n_samples // 2):
        difficulty = difficulties[i % 3]
        problem_data = generate_math_problem(difficulty)
        problem_data['id'] = i
        data.append(problem_data)
    
    # Generate word problems
    for i in range(n_samples // 2, n_samples):
        difficulty = difficulties[i % 3]
        problem_data = generate_word_problem(difficulty)
        problem_data['id'] = i
        data.append(problem_data)
    
    return pd.DataFrame(data)

# Generate our dataset
n_samples = 1200  # Smaller dataset for faster training
df = generate_dataset(n_samples=n_samples)
print(f"Generated {len(df)} problems")

# Split into train, validation, and test sets (70%, 15%, 15%)
train_size = int(0.7 * len(df))
val_size = int(0.15 * len(df))
test_size = len(df) - train_size - val_size

# Shuffle the dataset
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Split the data
train_df = df[:train_size]
val_df = df[train_size:train_size+val_size]
test_df = df[train_size+val_size:]

print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")


Using device: cpu
Generated 1200 problems
Train: 840, Validation: 180, Test: 180


In [2]:

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Format data for different training paradigms
def format_for_training(problem, solution, with_work=False, work=None):
    """Format the data for model training and inference."""
    if with_work and work is not None:
        return f"Problem: {problem}\nSolution: {work} The answer is {solution}."
    else:
        return f"Problem: {problem}\nSolution: The answer is {solution}."

def format_for_inference(problem):
    """Format the data for model inference."""
    return f"Problem: {problem}\nSolution:"

# Create formatted datasets
train_texts = [format_for_training(row['problem'], row['solution'], True, row['work']) 
               for _, row in train_df.iterrows()]
val_texts = [format_for_training(row['problem'], row['solution'], True, row['work']) 
             for _, row in val_df.iterrows()]
test_texts = [format_for_training(row['problem'], row['solution'], False) 
              for _, row in test_df.iterrows()]

# Create HuggingFace datasets
train_dataset = HFDataset.from_dict({"text": train_texts})
val_dataset = HFDataset.from_dict({"text": val_texts})
test_dataset = HFDataset.from_dict({"text": test_texts})

# Tokenize function for language modeling
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Tokenize all datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Create data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal language modeling, not masked language modeling
)

# Print an example
print("Example training text:")
print(train_texts[0])


Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Example training text:
Problem: A store has a 30% discount on all items. If an item originally costs $177, what is the final price?
Solution: To solve this problem, I need to calculate the discounted price as original price * (100% - discount%). So $177 * (100% - 30%) = $177 * 70% = $123.9. The answer is 123.9.


In [3]:

def extract_answer(text):
    """Extract the numerical answer from model output."""
    # Try to find "the answer is X" pattern
    match = re.search(r"the answer is ([-+]?\d*\.?\d+)", text.lower())
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            pass
    
    # Try to find "is X" pattern
    match = re.search(r"is ([-+]?\d*\.?\d+)", text.lower())
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            pass
    
    # Try to find any number in the text
    match = re.search(r"([-+]?\d*\.?\d+)", text)
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            pass
    
    return None

def evaluate_model_predictions(predictions, references):
    """Evaluate model predictions against references."""
    correct = 0
    extracted = 0
    
    for pred, ref in zip(predictions, references):
        extracted_answer = extract_answer(pred)
        if extracted_answer is not None:
            extracted += 1
            if abs(extracted_answer - ref) < 1e-5:  # Allow for floating point error
                correct += 1
    
    accuracy = correct / len(references) if len(references) > 0 else 0
    extraction_rate = extracted / len(references) if len(references) > 0 else 0
    
    return {
        "accuracy": accuracy,
        "extraction_rate": extraction_rate,
        "extracted_correct": correct,
        "total": len(references)
    }

def analyze_model_performance(predictions, references, test_df):
    """Analyze model performance by difficulty and problem type."""
    results = evaluate_model_predictions(predictions, references)
    print(f"Overall Accuracy: {results['accuracy']:.4f}")
    print(f"Answer extraction rate: {results['extraction_rate']:.4f}")
    print(f"Correct answers: {results['extracted_correct']} out of {results['total']}")
    
    # Analyze by difficulty
    for difficulty in ['easy', 'medium', 'hard']:
        indices = test_df[test_df['difficulty'] == difficulty].index
        diff_predictions = [predictions[i] for i in range(len(test_df)) if i in indices]
        diff_references = [references[i] for i in range(len(test_df)) if i in indices]
        diff_results = evaluate_model_predictions(diff_predictions, diff_references)
        print(f"  {difficulty.capitalize()} problems: {diff_results['accuracy']:.4f}")
    
    # Analyze by type
    for problem_type in ['numeric', 'word']:
        indices = test_df[test_df['type'] == problem_type].index
        type_predictions = [predictions[i] for i in range(len(test_df)) if i in indices]
        type_references = [references[i] for i in range(len(test_df)) if i in indices]
        type_results = evaluate_model_predictions(type_predictions, type_references)
        print(f"  {problem_type.capitalize()} problems: {type_results['accuracy']:.4f}")
    
    return results

In [4]:

# Evaluate the model
def evaluate_model(model, model_name):
    model.eval()
    predictions = []
    references = []
    
    print(f"Evaluating {model_name}...")
    for i in tqdm(range(len(test_df))):
        problem = test_df.iloc[i]['problem']
        input_text = format_for_inference(problem)
        
        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_length=64,
                do_sample=False,
                num_beams=3,
                pad_token_id=tokenizer.pad_token_id,
            )
        
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(predicted_text[len(input_text):])
        references.append(float(test_df.iloc[i]['solution']))
    
    print(f"\n{model_name} Results:")
    results = analyze_model_performance(predictions, references, test_df)
    
    # Display example predictions
    print("\nExample predictions:")
    for i in range(5):
        print(f"Problem: {test_df.iloc[i]['problem']}")
        print(f"True solution: {test_df.iloc[i]['solution']}")
        print(f"Predicted: {predictions[i]}")
        print()
    
    return predictions, references, results
    

In [None]:


from transformers import GPT2Config, GPT2LMHeadModel

# Create a smaller GPT-2 style config
small_config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=128,
    n_ctx=128,
    n_embd=256,
    n_layer=6,
    n_head=8,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)

# Create model from scratch
model_from_scratch = GPT2LMHeadModel(small_config)
model_from_scratch.to(device)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results-scratch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs-scratch",
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="none",  # Disable wandb/tensorboard
)

# Create trainer
trainer = Trainer(
    model=model_from_scratch,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

# Train the model
print("Training model from scratch...")
trainer.train()

# Save the model
trainer.save_model("./math-model-scratch")

# Run evaluation
scratch_predictions, scratch_references, scratch_results = evaluate_model(model_from_scratch, "Model from Scratch")


In [None]:

# Load pre-trained model (we'll use DistilGPT2 which is smaller but capable)
pretrained_model_name = "distilgpt2"
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)
pretrained_model.to(device)

# Evaluate zero-shot performance
print("Evaluating pretrained model in zero-shot setting...")
zs_predictions, zs_references, zs_results = evaluate_model(pretrained_model, "Zero-Shot DistilGPT2")


In [6]:

# Load pre-trained model (we'll use Qwen/Qwen2.5-Math which is smaller but capable)
pretrained_model_name = "Qwen/Qwen2.5-Math-1.5B"
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)
pretrained_model.to(device)

# Evaluate zero-shot performance
print("Evaluating pretrained model in zero-shot setting...")
zs_predictions, zs_references, zs_results = evaluate_model(pretrained_model, "Qwen/Qwen2.5-Math")



Evaluating pretrained model in zero-shot setting...
Evaluating Zero-Shot DistilGPT2...


  0%|          | 0/180 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=2048) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=2048) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=2048) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens`

KeyboardInterrupt: 

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-7B")
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Math-7B")

# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-Math-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Simple test problem
def format_for_inference(problem):
    return f"Please solve this math problem step by step and provide the final numerical answer.\nProblem: {problem}\nSolution: The answer is"
problem = "2 + 2 = ?"
prompt = f"Problem: {problem}\nSolution:"
prompt = format_for_inference(problem)

# Encode the input
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask

# Generate output
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=50,
        num_return_sequences=1,
        no_repeat_ngram_size=2,  # Prevent repetition
        temperature=0.7,
        do_sample=False,  # Deterministic generation
        num_beams=3,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode and analyze the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
solution_part = generated_text[len(prompt):]

print("Full prompt + response:")
print(generated_text)
print("\nJust the solution part:")
print(solution_part)
print("\nOutput token IDs:")
print(output[0].tolist())
print("\nInput prompt token IDs:")
print(input_ids[0].tolist())

In [None]:

# 4. Fine-tuning with LoRA
# Let's fine-tune our pre-trained model using LoRA for efficiency

if PEFT_AVAILABLE:
    from peft import LoraConfig, get_peft_model, TaskType
    
    print("Fine-tuning with LoRA...")
    
    # Load pre-trained model again to start fresh
    ft_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)
    ft_model.to(device)
    
    # Define LoRA configuration with lower rank for efficiency
    lora_config = LoraConfig(
        r=8,  # Rank
        lora_alpha=16,
        target_modules=["c_attn", "c_proj"],  # Target attention layers
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    
    # Apply LoRA to the model
    ft_model = get_peft_model(ft_model, lora_config)
    print(f"Trainable parameters: {ft_model.print_trainable_parameters()}")
    
    # Training arguments for fine-tuning
    ft_training_args = TrainingArguments(
        output_dir="./results-lora",
        num_train_epochs=3,  # Fewer epochs for fine-tuning
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,
        warmup_steps=50,
        weight_decay=0.01,
        logging_dir="./logs-lora",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=1,
        report_to="none",
    )
    
    # Trainer for fine-tuning
    ft_trainer = Trainer(
        model=ft_model,
        args=ft_training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
    )
    
    # Fine-tune the model
    ft_trainer.train()
    
    # Save the fine-tuned model
    ft_trainer.save_model("./math-model-finetuned")
    
    # Evaluate fine-tuned model
    ft_predictions, ft_references, ft_results = evaluate_model(ft_model, "Fine-tuned DistilGPT2")
else:
    print("Skipping LoRA fine-tuning as PEFT library is not available.")


In [None]:

# 5. Comparison and Analysis
import matplotlib.pyplot as plt

# Collect results from all approaches
all_results = {}

# Add results from each approach we've evaluated
all_results["From Scratch"] = scratch_results
all_results["Zero-Shot"] = zs_results
all_results["Few-Shot"] = few_shot_results
if PEFT_AVAILABLE:
    all_results["Fine-tuned"] = ft_results

# Create a summary table
comparison_df = pd.DataFrame({
    "Model": list(all_results.keys()),
    "Accuracy": [r["accuracy"] for r in all_results.values()],
    "Extraction Rate": [r["extraction_rate"] for r in all_results.values()],
    "Correct/Total": [f"{r['extracted_correct']}/{r['total']}" for r in all_results.values()]
})

print("Model Performance Comparison:")
print(comparison_df)

# Visualize the results
plt.figure(figsize=(10, 6))
x = comparison_df["Model"]
y = comparison_df["Accuracy"]
plt.bar(x, y, color=['blue', 'green', 'orange', 'red'][:len(x)])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, max(y) * 1.2)  # Make room for labels
for i, v in enumerate(y):
    plt.text(i, v + 0.01, f"{v:.4f}", ha='center')
plt.tight_layout()
plt.show()


In [None]:

# 6. Conclusion

In this notebook, we've explored three approaches to solving math problems with transformer models:

1. **Training from Scratch**: We trained a small GPT-2 style model from scratch on our math dataset.
2. **Zero-Shot Learning**: We used a pre-trained DistilGPT2 model to solve math problems without any additional training.
3. **Few-Shot Learning**: We provided the pre-trained model with a few examples to guide its reasoning.
4. **Fine-tuning with LoRA**: We fine-tuned the pre-trained model using parameter-efficient methods.

## Key Takeaways

1. **Pre-trained vs. From Scratch**: Pre-trained models generally perform better, even in zero-shot settings, 
   as they've learned general language patterns and some basic mathematical reasoning.

2. **Few-Shot Learning**: Adding a few examples can significantly improve performance without any parameter updates.

3. **Fine-tuning Benefits**: Fine-tuning with LoRA helps the model adapt to our specific task while 
   preserving most of the pre-trained knowledge.

4. **Problem Difficulty**: All approaches struggle more with harder problems and word problems that 
   require more complex reasoning.

These results demonstrate the power of transformer models for mathematical reasoning, and highlight
how different training paradigms can be effectively used depending on resource constraints and accuracy requirements.