# SmolLM2-1.7B Training Results Comparison

This notebook compares the original SmolLM2-1.7B model with our trained final_model.

## Training Summary
- **Dataset**: Cosmopedia-v2 (1B tokens)
- **Steps Trained**: 30,011 steps
- **Final Loss**: 3.7547
- **Training Time**: ~13 hours

## Test Configuration
- **Generation Parameters**: temperature=0.7, top_p=0.9, max_new_tokens=30
- **Prompts**: The same prompts that were used for validation during training
- **Memory Strategy**: Load the models one at a time to avoid out-of-memory (OOM) errors

In [1]:
import torch
import time
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path
import pandas as pd
from typing import List, Dict, Tuple
import numpy as np

# Report every CUDA device visible to this process along with its total
# memory (in decimal gigabytes); print a CPU notice when CUDA is unavailable.
if not torch.cuda.is_available():
    print("CUDA not available - using CPU")
else:
    for gpu_index in range(torch.cuda.device_count()):
        device_name = torch.cuda.get_device_name(gpu_index)
        print(f"GPU {gpu_index}: {device_name}")
        total_bytes = torch.cuda.get_device_properties(gpu_index).total_memory
        print(f"Memory: {total_bytes / 1e9:.1f}GB")

GPU 0: NVIDIA L4
Memory: 23.7GB
GPU 1: NVIDIA L4
Memory: 23.7GB
GPU 2: NVIDIA L4
Memory: 23.7GB
GPU 3: NVIDIA L4
Memory: 23.7GB


In [6]:
# Configuration: the two model sources being compared
BASE_MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B"
CHECKPOINT_PATH = "/home/ubuntu/bigdata/Training/Day4/cosmopedia-v2-1B/smollm-1.7B-cosmo-1B-production/final_model"

# Generation parameters (same as training).
# pad_token_id is only a placeholder here: generate_text fills it in from the
# tokenizer on a per-call copy of this dict.
GENERATION_CONFIG = dict(
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=30,
    do_sample=True,
    pad_token_id=None,
)

# Test prompts (same as used during training), one open-ended stem per entry
TEST_PROMPTS = [
    "The weather today is very",
    "Machine learning is a field of",
    "The capital of France is",
    "In the year 2024, technology",
    "Artificial intelligence can help",
    "The most important thing in life is",
    "Scientists have recently discovered",
    "The future of renewable energy",
    "Education is essential because",
    "Climate change represents",
]

In [7]:
def cleanup_memory():
    """Run the garbage collector and, when CUDA is present, flush its cache.

    Always performs a Python garbage-collection pass. If CUDA is available it
    then empties PyTorch's CUDA cache and waits for pending CUDA work to
    finish. Returns nothing.
    """
    gc.collect()

    # Nothing GPU-related to release on a CPU-only machine
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

def load_model_and_tokenizer(model_path: str, is_checkpoint: bool = False) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a causal language model and its tokenizer.

    Args:
        model_path: Hugging Face Hub model id, or the path of a local
            checkpoint directory that also holds the tokenizer files.
        is_checkpoint: Set to True when ``model_path`` is a local checkpoint
            directory. The directory is then verified before loading so a
            wrong path fails with a clear message.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and the
        tokenizer is guaranteed to have a pad token (falling back to EOS).

    Raises:
        FileNotFoundError: If ``is_checkpoint`` is True and ``model_path`` is
            not an existing directory.
    """
    print(f"Loading model from: {model_path}")

    # A local checkpoint must exist on disk; fail early instead of letting the
    # loader fall through to a Hub lookup with a misleading error message.
    if is_checkpoint and not Path(model_path).is_dir():
        raise FileNotFoundError(f"Checkpoint directory not found: {model_path}")

    # The tokenizer is resolved from the same location as the model for both
    # Hub ids and local checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Generation needs a pad token; reuse EOS when the tokenizer defines none
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with memory optimization: bfloat16 weights, with device
    # placement delegated to the loader via device_map="auto".
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # NOTE(review): trust_remote_code lets the model repository run its own
        # Python code at load time - confirm it is actually required here.
        trust_remote_code=True,
    )

    model.eval()
    print(f"Model loaded successfully. Parameters: {model.num_parameters():,}")

    return model, tokenizer

def generate_text(model, tokenizer, prompt: str, config: Dict) -> Dict:
    """Generate a continuation for ``prompt`` and measure generation speed.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text to continue.
        config: Keyword arguments for ``model.generate``. It is not mutated;
            ``pad_token_id`` is overridden on a copy with the tokenizer's value.

    Returns:
        Dict with keys ``prompt``, ``generated_text`` (continuation only,
        stripped), ``full_output`` (decoded prompt + continuation),
        ``generation_time`` (seconds), ``tokens_generated`` (number of newly
        generated token ids) and ``tokens_per_second``.
    """
    # Work on a copy so the shared config dict keeps its placeholder value
    gen_config = {**config, "pad_token_id": tokenizer.pad_token_id}

    inputs = tokenizer(prompt, return_tensors="pt")

    # Put the inputs where the model lives rather than assuming the default
    # CUDA device; fall back to the old behaviour if the model has no .device.
    target_device = getattr(model, "device", None)
    if target_device is None and torch.cuda.is_available():
        target_device = "cuda"
    if target_device is not None:
        inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    prompt_length = inputs["input_ids"].shape[-1]

    # perf_counter is monotonic and high-resolution, unlike wall-clock time()
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_config)
    generation_time = time.perf_counter() - start_time

    # The returned sequence starts with the prompt ids; everything after them
    # is newly generated. Slicing token ids avoids assuming that the decoded
    # text reproduces the prompt string character for character.
    sequence = outputs[0]
    new_token_ids = sequence[prompt_length:]

    generated_text = tokenizer.decode(sequence, skip_special_tokens=True)
    new_text = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    # Count the ids the model actually produced instead of re-tokenizing the
    # decoded text, which does not have to yield the same number of tokens.
    new_tokens = int(new_token_ids.numel())
    tokens_per_second = new_tokens / generation_time if generation_time > 0 else 0

    return {
        "prompt": prompt,
        "generated_text": new_text,
        "full_output": generated_text,
        "generation_time": generation_time,
        "tokens_generated": new_tokens,
        "tokens_per_second": tokens_per_second
    }

def calculate_perplexity(model, tokenizer, text: str) -> float:
    """Calculate the model's perplexity on ``text``.

    The text is tokenized and passed through the model with the input ids
    doubling as labels; the perplexity is ``exp`` of the loss the model
    returns.

    Args:
        model: Causal language model that returns an object with a ``loss``
            attribute when called with ``labels``.
        tokenizer: Tokenizer matching ``model``.
        text: Text to score.

    Returns:
        The perplexity as a Python float.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Put the inputs where the model lives rather than assuming the default
    # CUDA device; fall back to the old behaviour if the model has no .device.
    target_device = getattr(model, "device", None)
    if target_device is None and torch.cuda.is_available():
        target_device = "cuda"
    if target_device is not None:
        inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        perplexity = torch.exp(loss).item()

    return perplexity

def test_model(model, tokenizer, model_name: str) -> List[Dict]:
    """Run every prompt in TEST_PROMPTS through the model and collect metrics.

    For each prompt a continuation is generated with GENERATION_CONFIG, the
    perplexity of the full output is computed with the same model, and both
    are printed. A prompt that raises is reported and skipped, so the returned
    list may be shorter than TEST_PROMPTS.

    Args:
        model: Model under test.
        tokenizer: Tokenizer matching ``model``.
        model_name: Label stored under the ``model`` key of every result.

    Returns:
        One result dict per successfully processed prompt: the generate_text
        output extended with ``perplexity`` and ``model``.
    """
    print(f"\n=== Testing {model_name} ===")
    total_prompts = len(TEST_PROMPTS)
    collected = []

    for position, prompt in enumerate(TEST_PROMPTS, start=1):
        print(f"Testing prompt {position}/{total_prompts}: '{prompt}'")

        try:
            entry = generate_text(model, tokenizer, prompt, GENERATION_CONFIG)

            # Score the prompt together with its continuation
            ppl = calculate_perplexity(model, tokenizer, entry["full_output"])
            entry.update(perplexity=ppl, model=model_name)
            collected.append(entry)

            print(f"  Generated: '{entry['generated_text']}'")
            print(f"  Time: {entry['generation_time']:.2f}s")
            print(f"  Tokens/sec: {entry['tokens_per_second']:.1f}")
            print(f"  Perplexity: {ppl:.2f}")

        except Exception as e:
            # Best effort: report the failure and move on to the next prompt
            print(f"  Error: {e}")

    return collected

## Test 1: Original SmolLM2-1.7B Model

In [4]:
# Start from a clean slate before anything is loaded
cleanup_memory()

# Base model: load it, then run the full prompt suite
print("Loading original SmolLM2-1.7B model...")
original_model, original_tokenizer = load_model_and_tokenizer(
    BASE_MODEL_NAME,
    is_checkpoint=False,
)
original_results = test_model(
    original_model,
    original_tokenizer,
    "Original SmolLM2-1.7B",
)

# Drop both references before the cleanup pass so the next model can be
# loaded on its own
del original_model
del original_tokenizer
cleanup_memory()
print("\nOriginal model testing complete. Memory cleared.")

Loading original SmolLM2-1.7B model...
Loading model from: HuggingFaceTB/SmolLM2-1.7B


model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

OSError: Can't load the model for 'HuggingFaceTB/SmolLM2-1.7B'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'HuggingFaceTB/SmolLM2-1.7B' is the correct path to a directory containing a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.

## Test 2: Trained Model (Final Model)

In [None]:
# Trained model: load it from the local checkpoint directory, then run the
# same prompt suite
print("Loading trained final model...")
trained_model, trained_tokenizer = load_model_and_tokenizer(
    CHECKPOINT_PATH,
    is_checkpoint=True,
)
trained_results = test_model(
    trained_model,
    trained_tokenizer,
    "Trained (final_model)",
)

# Drop both references before the cleanup pass
del trained_model
del trained_tokenizer
cleanup_memory()
print("\nTrained model testing complete. Memory cleared.")

Loading trained final model...
Loading model from: /home/ubuntu/bigdata/Training/Day4/cosmopedia-v2-1B/smollm-1.7B-cosmo-1B-production/final_model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully. Parameters: 1,711,376,384

=== Testing Trained (Checkpoint-26000) ===
Testing prompt 1/10: 'The weather today is very'


  Generated: 'an essential aspect of life, particularly when it comes to the connection between two individuals. One such individual is the concept of "the right of the law'
  Time: 0.80s
  Tokens/sec: 37.3
  Perplexity: 12.23
Testing prompt 2/10: 'Machine learning is a field of'
  Generated: 'science that involves the study of the behavior of the human body, the human brain, and the environment of the human body. It is an essential skill'
  Time: 0.80s
  Tokens/sec: 37.5
  Perplexity: 6.04
Testing prompt 3/10: 'The capital of France is'
  Generated: 'a fascinating form of political and social science that has been explored for over centuries. It has been a fascinating and multifaceted approach in recent years due to its'
  Time: 0.80s
  Tokens/sec: 37.6
  Perplexity: 12.53
Testing prompt 4/10: 'In the year 2024, technology'
  Generated: ', and network are a critical area of information that can help us understand and analyze the data. One important aspect of data analysis is the use o

## Results Analysis

In [None]:
# Merge both runs (original first, trained second) into one table
all_results = [*original_results, *trained_results]
df = pd.DataFrame(all_results)

print("=== RESULTS SUMMARY ===")
print(f"Total tests completed: {len(all_results)}")
print(f"Original model results: {len(original_results)}")
print(f"Trained model results: {len(trained_results)}")

# Build a display copy restricted to the reporting columns, with the numeric
# columns rounded for readability; df itself keeps full precision
report_columns = ['model', 'prompt', 'generated_text', 'generation_time', 'tokens_per_second', 'perplexity']
display_df = df[report_columns].copy()
for column_name, decimals in (('generation_time', 2), ('tokens_per_second', 1), ('perplexity', 2)):
    display_df[column_name] = display_df[column_name].round(decimals)

print("\n=== DETAILED RESULTS ===")
print(display_df.to_string(index=False))

In [None]:
# Per-model aggregate statistics
print("\n=== PERFORMANCE COMPARISON ===")

# Mean and standard deviation of every metric, grouped by model label
metric_columns = ['generation_time', 'tokens_per_second', 'perplexity', 'tokens_generated']
comparison = df.groupby('model').agg(
    {column_name: ['mean', 'std'] for column_name in metric_columns}
).round(2)

print(comparison)

# Relative changes are only meaningful when both runs produced results
if original_results and trained_results:
    original_avg_perplexity, trained_avg_perplexity = (
        np.mean([r['perplexity'] for r in run])
        for run in (original_results, trained_results)
    )
    original_avg_speed, trained_avg_speed = (
        np.mean([r['tokens_per_second'] for r in run])
        for run in (original_results, trained_results)
    )

    # Positive perplexity_improvement means the trained model scored lower
    perplexity_drop = original_avg_perplexity - trained_avg_perplexity
    perplexity_improvement = (perplexity_drop / original_avg_perplexity) * 100
    speed_delta = trained_avg_speed - original_avg_speed
    speed_change = (speed_delta / original_avg_speed) * 100

    print("\n=== TRAINING IMPACT ===")
    print(f"Average Perplexity - Original: {original_avg_perplexity:.2f}, Trained: {trained_avg_perplexity:.2f}")
    print(f"Perplexity Improvement: {perplexity_improvement:+.1f}%")
    print("")
    print(f"Average Speed - Original: {original_avg_speed:.1f} tokens/sec, Trained: {trained_avg_speed:.1f} tokens/sec")
    print(f"Speed Change: {speed_change:+.1f}%")

In [None]:
# Per-prompt view: both generations and their metrics next to each other
print("\n=== SIDE-BY-SIDE GENERATION COMPARISON ===")

for prompt in TEST_PROMPTS:
    print(f"\n📝 **Prompt:** '{prompt}'")
    print("─" * 80)

    # First recorded result for this prompt in each run; None when that run
    # has no entry for the prompt
    original_result = next(filter(lambda r: r['prompt'] == prompt, original_results), None)
    trained_result = next(filter(lambda r: r['prompt'] == prompt, trained_results), None)

    for label, result in (("🔵 **Original:**", original_result), ("🟢 **Trained:**", trained_result)):
        if not result:
            continue
        print(f"{label} {result['generated_text']}")
        print(f"   ⏱️  {result['generation_time']:.2f}s | 🚀 {result['tokens_per_second']:.1f} tok/s | 📊 PPL: {result['perplexity']:.2f}")

    # The relative change needs both sides; positive means lower trained perplexity
    if original_result and trained_result:
        baseline_ppl = original_result['perplexity']
        ppl_change = ((baseline_ppl - trained_result['perplexity']) / baseline_ppl) * 100
        print(f"   📈 **Improvement:** {ppl_change:+.1f}% perplexity change")

In [None]:
# Persist the combined results table as CSV for later analysis
results_path = "/home/ubuntu/bigdata/Training/Day4/cosmopedia-v2-1B/model_comparison_results.csv"
df.to_csv(results_path, index=False)
print(f"\n💾 Results saved to: {results_path}")

# Summary statistics
print("\n=== FINAL SUMMARY ===")
# The evaluated final_model was trained for 30,011 steps (see the Training
# Summary at the top of the notebook); the previous "26,000" was a stale value
# from an earlier checkpoint run.
print("✅ Training completed: 30,011 steps")
print("✅ Dataset: Cosmopedia-v2 (1B tokens)")
print("✅ Final training loss: 3.7547")
print("✅ Model comparison completed successfully")
print("✅ Results saved for further analysis")

# perplexity_improvement is only defined by the comparison cell when both runs
# produced results, so the verdict is guarded by the same condition
if len(original_results) > 0 and len(trained_results) > 0:
    if perplexity_improvement > 0:
        print(f"🎉 Training was successful! Perplexity improved by {perplexity_improvement:.1f}%")
    else:
        print(f"⚠️  Training may need adjustment. Perplexity changed by {perplexity_improvement:.1f}%")

print("\n🔥 **Ready for production use or further fine-tuning!**")