# Semi-Supervised Learning with Organized Configuration

This notebook demonstrates a clean, organized approach to SSL experiments using class-based configurations.

## Key Features:
- **Centralized Configuration**: All settings managed through config classes
- **Modular Design**: Separate modules for data, models, training, and utilities
- **Reproducible Experiments**: Consistent seed setting and configuration management
- **Easy Experimentation**: Simple configuration changes for different experiments

## 1. Import Modules

In [None]:
# Import our organized modules
from config import (
    ExperimentConfig, 
    get_full_experiment_config, 
    get_baseline_config, 
    get_small_experiment_config
)
from data_utils import DataManager
from models import create_autoencoder, create_classifier, print_model_summary
from trainers import AutoencoderTrainer, ClassifierTrainer, evaluate_model
from utils import (
    set_seed, 
    setup_device, 
    create_directories, 
    print_experiment_summary, 
    visualize_features_tsne
)

import torch
import os
import matplotlib.pyplot as plt

## 2. Configuration Setup

Choose from predefined configurations or create custom ones:

In [None]:
# --- Choose the experiment configuration ---------------------------------
# Presets: keep exactly one of these lines active.
config = get_small_experiment_config()    # quick test run
# config = get_full_experiment_config()   # full experiment
# config = get_baseline_config()          # baseline comparison

# Alternative: build an ExperimentConfig by hand and override fields.
# config = ExperimentConfig()
# config.experiment_name = "my_custom_experiment"
# config.data.labeled_size = 3000
# config.data.batch_size = 64
# config.training.autoencoder_epochs = 30
# config.training.classifier_epochs = 25

# Run the config's own validation before anything else depends on it.
config.validate()

# Echo the settings that matter most for this run, one per line.
print(
    f"Experiment: {config.experiment_name}",
    f"Labeled samples: {config.data.labeled_size}",
    f"Autoencoder epochs: {config.training.autoencoder_epochs}",
    f"Classifier epochs: {config.training.classifier_epochs}",
    sep="\n",
)

## 3. Setup Environment

In [None]:
# Seed from the data config so that runs are reproducible.
set_seed(config.data.random_seed)

# The config decides which device to use; report the choice.
device = config.get_device()
print(f"Using device: {device}")

# Directories required by the training cells (currently only checkpoints).
required_dirs = [config.training.checkpoint_dir]
create_directories(required_dirs)

## 4. Data Setup

In [None]:
# Build the data manager from the data section of the config, prepare the
# datasets, and print the manager's own summary.
data_manager = DataManager(config.data)
data_manager.setup_datasets()
data_manager.print_data_summary()

# The loaders come back in the order: labeled, unlabeled, validation, test.
loaders = data_manager.get_data_loaders()
labeled_loader, unlabeled_loader, val_loader, test_loader = loaders

# Report the length of each loader.
print(
    f"\nData loaders created:",
    f"  Labeled batches: {len(labeled_loader)}",
    f"  Unlabeled batches: {len(unlabeled_loader)}",
    f"  Validation batches: {len(val_loader)}",
    f"  Test batches: {len(test_loader)}",
    sep="\n",
)

## 5. Phase 1: Autoencoder Pre-training (Optional)

In [None]:
# Both stay None when pre-training is skipped; later cells test for that.
autoencoder_results = None
pretrained_encoder = None

# Pre-training only runs when a positive number of epochs is configured.
pretraining_enabled = config.training.autoencoder_epochs > 0

if not pretraining_enabled:
    print("=== Skipping autoencoder pre-training ===")
else:
    print("=== Phase 1: Autoencoder Pre-training ===")

    # Build the autoencoder on the selected device and show its summary.
    autoencoder = create_autoencoder(config.model).to(device)
    print_model_summary(autoencoder, "Autoencoder")

    # Train on the unlabeled loader, validating against the validation loader.
    autoencoder_trainer = AutoencoderTrainer(
        autoencoder, config.training, device, config.training.checkpoint_dir
    )
    autoencoder_results = autoencoder_trainer.train(unlabeled_loader, val_loader)

    # Keep the encoder half so the classifier cell can reuse it.
    pretrained_encoder = autoencoder.encoder

    print(f"\nAutoencoder training completed!")
    print(f"Best validation loss: {autoencoder_results['best_val_loss']:.6f}")

## 6. Visualize Learned Features (if autoencoder was trained)

In [None]:
# Nothing to visualize unless the pre-training cell produced an encoder.
if pretrained_encoder is not None:
    print("=== Visualizing learned features ===")

    # The plot is saved alongside the checkpoints.
    tsne_path = os.path.join(config.training.checkpoint_dir, "tsne_features.png")

    visualize_features_tsne(
        autoencoder, val_loader, device, max_samples=2000, save_path=tsne_path
    )

## 7. Phase 2: Classifier Training

In [None]:
print("=== Phase 2: Classifier Training ===")

# Build the classifier, passing along the pre-trained encoder (None when
# pre-training was skipped), move it to the device, and show its summary.
classifier = create_classifier(config.model, pretrained_encoder).to(device)
print_model_summary(classifier, "Classifier")

# Train on the labeled loader, validating against the validation loader.
classifier_trainer = ClassifierTrainer(
    classifier, config.training, device, config.training.checkpoint_dir
)
classifier_results = classifier_trainer.train(labeled_loader, val_loader)

print(f"\nClassifier training completed!")
print(f"Best validation accuracy: {classifier_results['best_val_acc']:.2f}%")

## 8. Final Evaluation

In [None]:
print("=== Final Evaluation ===")

# Score the trained classifier on the test loader.
test_accuracy = evaluate_model(classifier, test_loader, device)
print(f"Test accuracy: {test_accuracy:.2f}%")

# Collect the headline numbers for the experiment summary.
results = {}
results['Best Validation Accuracy'] = classifier_results['best_val_acc']
results['Test Accuracy'] = test_accuracy

# The autoencoder loss is only reported when pre-training produced results.
if autoencoder_results:
    results['Best Autoencoder Val Loss'] = autoencoder_results['best_val_loss']

print_experiment_summary(config, results)

## 9. Training History Visualization

In [None]:
# --- Classifier curves: accuracy (left panel) and loss (right panel) -----
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

history = classifier_results['history']
epochs = range(1, len(history['train_acc']) + 1)

# One row per panel:
# (axis, train key, train label, val key, val label, title, y-axis label)
panels = [
    (axes[0], 'train_acc', 'Training Accuracy', 'val_acc', 'Validation Accuracy',
     'Classifier Training History', 'Accuracy (%)'),
    (axes[1], 'train_loss', 'Training Loss', 'val_loss', 'Validation Loss',
     'Classifier Loss History', 'Loss'),
]

for ax, train_key, train_label, val_key, val_label, title, y_label in panels:
    ax.plot(epochs, history[train_key], 'b-', label=train_label)
    ax.plot(epochs, history[val_key], 'r-', label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

# --- Autoencoder curves: only drawn when pre-training produced results ---
if autoencoder_results:
    ae_history = autoencoder_results['history']
    ae_epochs = range(1, len(ae_history['train_loss']) + 1)

    plt.figure(figsize=(8, 5))

    # (history key, line format, legend label) for each curve.
    ae_curves = (
        ('train_loss', 'b-', 'Training Loss'),
        ('val_loss', 'r-', 'Validation Loss'),
    )
    for curve_key, line_fmt, curve_label in ae_curves:
        plt.plot(ae_epochs, ae_history[curve_key], line_fmt, label=curve_label)

    plt.title('Autoencoder Training History')
    plt.xlabel('Epoch')
    plt.ylabel('Reconstruction Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

## 10. Configuration Comparison Example

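The function below shows how a different experiment can be run by changing a single configuration field — here, the number of labeled samples. For speed, it skips autoencoder pre-training and trains only the classifier.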

In [None]:
# Example: Compare different labeled data sizes
def compare_labeled_sizes(labeled_sizes=None):
    """Compare performance with different amounts of labeled data.

    For every requested size, a small-experiment configuration is created
    with its ``data.labeled_size`` and ``experiment_name`` overridden. A
    classifier is then trained on the labeled loader (autoencoder
    pre-training is skipped for speed) and evaluated on the test loader.

    Args:
        labeled_sizes: Iterable of labeled-sample counts to try, processed
            in iteration order. When ``None`` (the default), the sizes
            1000, 2000 and 5000 are used. An empty iterable runs no
            experiments and yields an empty result list.

    Returns:
        A list with one dict per size, in the order the sizes were given.
        Each dict has the keys ``'labeled_size'``, ``'val_acc'`` (the
        trainer's ``'best_val_acc'``) and ``'test_acc'``.
    """
    # Use an explicit None check so that an empty iterable is honored
    # instead of silently falling back to the defaults.
    if labeled_sizes is None:
        labeled_sizes = (1000, 2000, 5000)

    results = []

    for size in labeled_sizes:
        print(f"\n=== Testing with {size} labeled samples ===")

        # Create a configuration for this run and override the labeled size.
        test_config = get_small_experiment_config()
        test_config.data.labeled_size = size
        test_config.experiment_name = f"ssl_labeled_{size}"

        # Re-seed for every run so each size starts from the same seed.
        set_seed(test_config.data.random_seed)
        device = test_config.get_device()

        # Setup data; the unlabeled loader is not needed here.
        dm = DataManager(test_config.data)
        dm.setup_datasets()
        labeled_loader, _, val_loader, test_loader = dm.get_data_loaders()

        # Train classifier (skip autoencoder for speed)
        classifier = create_classifier(test_config.model).to(device)
        trainer = ClassifierTrainer(classifier, test_config.training, device)
        train_results = trainer.train(labeled_loader, val_loader)

        # Evaluate
        test_acc = evaluate_model(classifier, test_loader, device)

        results.append({
            'labeled_size': size,
            'val_acc': train_results['best_val_acc'],
            'test_acc': test_acc,
        })

        print(f"Results: Val Acc: {train_results['best_val_acc']:.2f}%, Test Acc: {test_acc:.2f}%")

    return results

# Uncomment to run comparison
# comparison_results = compare_labeled_sizes()
print("Comparison function defined. Uncomment the line above to run.")

## Summary

This organized approach provides:

1. **Clean Configuration Management**: All experiment settings in one place
2. **Modular Design**: Easy to modify individual components
3. **Reproducible Experiments**: Consistent seed setting and configuration
4. **Easy Experimentation**: Simple configuration changes for different setups
5. **Professional Code Structure**: Follows software engineering best practices

### Key Benefits:
- **Maintainable**: Easy to understand and modify
- **Extensible**: Simple to add new models, datasets, or training strategies
- **Reproducible**: Consistent results across runs
- **Configurable**: Easy to run different experiments
- **Professional**: Clean, organized code structure