# Neural Network Fundamentals: Architecture and Implementation

This notebook dives deep into neural network architectures, layer types, and implementation details.

## What You'll Master:
1. **Dense/Fully Connected Layers**: The building blocks
2. **Multi-layer Perceptrons**: Stacking layers for complexity
3. **Universal Approximation**: Why neural networks work
4. **Regularization**: Preventing overfitting
5. **Batch Processing**: Efficient training
6. **Real-world Examples**: Classification and regression

**Prerequisites**: Complete `01_deep_learning_foundations.ipynb` first.

In [None]:
# Cell 1: Comprehensive Imports with Documentation
"""
LIBRARY ECOSYSTEM EXPLANATION:

TensorFlow/Keras Stack:
- tensorflow: Google's deep learning framework
- keras: High-level API built into TensorFlow
- Why this combination: Easy to use, industry standard, great documentation

Scientific Computing:
- numpy: Numerical operations, works seamlessly with TensorFlow
- pandas: Data manipulation and analysis
- scikit-learn: Traditional ML algorithms, preprocessing, metrics

Visualization:
- matplotlib: Core plotting library
- seaborn: Statistical plotting with better defaults
- plotly: Interactive plots (optional)
"""

# Deep learning framework and the Keras API bundled with it
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.utils import plot_model

# Numerical computing, tabular data and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this import raises ImportError on newer releases. Confirm the scikit-learn
# version this notebook is pinned to, or move the regression example to another dataset.
from sklearn.datasets import make_classification, make_regression, load_wine, load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score

import warnings
# Suppresses every warning category for the rest of the session (including deprecations).
warnings.filterwarnings('ignore')

# Seed NumPy and TensorFlow with the same value so repeated runs give the same numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Report the library versions and hardware this session is running on.
gpu_devices = tf.config.list_physical_devices('GPU')
has_gpu = len(gpu_devices) > 0
print(
    "🔧 ENVIRONMENT SETUP",
    f"TensorFlow version: {tf.__version__}",
    f"Keras version: {keras.__version__}",
    f"NumPy version: {np.__version__}",
    f"GPU available: {has_gpu}",
    f"Random seed: {RANDOM_SEED} (for reproducible results)",
    sep="\n",
)

# Plot defaults shared by every figure in the notebook.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})
print("\n✅ All libraries imported and configured successfully!")

## 1. Dense (Fully Connected) Layers: The Foundation

**What is a Dense Layer?**
- Every input neuron connects to every output neuron
- Mathematical operation: `output = activation(input @ weights + bias)`
- Most fundamental building block of neural networks

**Parameters in a Dense Layer:**
- **Weights**: `(input_size, output_size)` matrix
- **Bias**: `(output_size,)` vector
- **Total parameters**: `input_size * output_size + output_size`

**Key Concepts:**
- **Fan-in**: Number of inputs to a neuron
- **Fan-out**: Number of outputs from a neuron
- **Weight initialization**: Critical for training success
- **Activation function**: Introduces non-linearity

In [None]:
# Cell 2: Dense Layer Deep Dive

print("=== DENSE LAYER ANATOMY ===")

# Dimensions of the toy dense layer examined in this cell:
# 3 input features -> 2 output neurons, fed 5 samples at a time.
input_dim, output_dim, batch_size = 3, 2, 5

print(
    "\n🏗️ LAYER CONFIGURATION:",
    f"Input dimension: {input_dim} (e.g., height, weight, age)",
    f"Output dimension: {output_dim} (e.g., probability of class A, B)",
    f"Batch size: {batch_size} (number of samples processed together)",
    sep="\n",
)

# Method 1: build the layer by hand to expose what happens inside it.
print("\n🔧 MANUAL IMPLEMENTATION:")

# Initialize weights using different strategies
def compare_initializations(input_dim, output_dim):
    """Create one example weight matrix per initialization scheme.

    Args:
        input_dim: Number of input features (rows of each matrix, the fan-in).
        output_dim: Number of output neurons (columns of each matrix, the fan-out).

    Returns:
        Dict mapping the scheme name ('zeros', 'ones', 'random_normal',
        'random_uniform', 'xavier_uniform', 'he_normal') to a tensor of
        shape (input_dim, output_dim).
    """
    shape = (input_dim, output_dim)

    # Xavier/Glorot uniform samples from [-limit, limit] with limit = sqrt(6 / (fan_in + fan_out)).
    xavier_limit = tf.sqrt(6.0 / (input_dim + output_dim))
    # He normal uses a standard deviation of sqrt(2 / fan_in).
    he_stddev = tf.sqrt(2.0 / input_dim)

    # The random draws below are made in a fixed order so seeded runs stay reproducible.
    initializations = {}
    initializations['zeros'] = tf.zeros(shape)
    initializations['ones'] = tf.ones(shape)
    initializations['random_normal'] = tf.random.normal(shape, mean=0, stddev=1)
    initializations['random_uniform'] = tf.random.uniform(shape, -1, 1)
    initializations['xavier_uniform'] = tf.random.uniform(shape, -xavier_limit, xavier_limit)
    initializations['he_normal'] = tf.random.normal(shape, mean=0, stddev=he_stddev)
    return initializations

# Draw one weight matrix per scheme and summarise its statistics.
weight_examples = compare_initializations(input_dim, output_dim)

print("Weight initialization comparison:")
for scheme, matrix in weight_examples.items():
    mean_val, std_val, min_val, max_val = (
        tf.reduce_mean(matrix),
        tf.math.reduce_std(matrix),
        tf.reduce_min(matrix),
        tf.reduce_max(matrix),
    )
    print(f"  {scheme:15s}: mean={mean_val:.4f}, std={std_val:.4f}, range=[{min_val:.4f}, {max_val:.4f}]")

print(
    "\n💡 INITIALIZATION INSIGHTS:",
    "• Zeros/Ones: Bad! All neurons learn the same thing (symmetry problem)",
    "• Random Normal/Uniform: Simple but may cause vanishing/exploding gradients",
    "• Xavier: Good for sigmoid/tanh activations",
    "• He: Good for ReLU activations (most common choice)",
    sep="\n",
)

# The worked example uses the He-initialized matrix (paired with ReLU below) and a zero bias.
W = tf.Variable(weight_examples['he_normal'], name='weights')
b = tf.Variable(tf.zeros([output_dim]), name='bias')

n_weights = tf.size(W)
n_biases = tf.size(b)
print(
    "\n⚖️ SELECTED PARAMETERS:",
    f"Weights (W) shape: {W.shape}",
    f"Weights values:\n{W.numpy()}",
    f"Bias (b) shape: {b.shape}",
    f"Bias values: {b.numpy()}",
    f"Total parameters: {n_weights + n_biases} = {n_weights} + {n_biases}",
    sep="\n",
)

# Five hand-written samples, one row per person: (height, weight, age).
sample_rows = [
    [1.0, 2.0, 3.0],   # Person 1: height=1.0, weight=2.0, age=3.0
    [1.5, 2.5, 2.0],   # Person 2
    [0.8, 1.8, 4.0],   # Person 3
    [1.2, 3.0, 1.5],   # Person 4
    [2.0, 1.5, 2.5],   # Person 5
]
X_sample = tf.constant(sample_rows)

print(
    "\n📊 SAMPLE INPUT DATA:",
    f"Shape: {X_sample.shape} (batch_size={batch_size}, features={input_dim})",
    f"Data:\n{X_sample.numpy()}",
    sep="\n",
)

# Walk through the forward pass one operation at a time.
print("\n🧮 MANUAL FORWARD PASS:")

# Step 1: linear transformation, (batch, in) @ (in, out) -> (batch, out).
linear_output = X_sample @ W
print(
    "Step 1 - Matrix multiplication (X @ W):",
    f"  Input shape: {X_sample.shape}",
    f"  Weight shape: {W.shape}",
    f"  Output shape: {linear_output.shape}",
    f"  Result:\n{linear_output.numpy()}",
    sep="\n",
)

# Step 2: the bias vector is broadcast across every row of the batch.
linear_plus_bias = tf.add(linear_output, b)
print(
    "\nStep 2 - Add bias (linear + b):",
    f"  Linear output: {linear_output.shape}",
    f"  Bias: {b.shape}",
    f"  Result (broadcasting):\n{linear_plus_bias.numpy()}",
    sep="\n",
)

# Step 3: ReLU zeroes the negative pre-activations and keeps the rest.
activated_output = tf.nn.relu(linear_plus_bias)
print(
    "\nStep 3 - Apply ReLU activation:",
    f"  Before ReLU:\n{linear_plus_bias.numpy()}",
    f"  After ReLU:\n{activated_output.numpy()}",
    "  Effect: Negative values → 0, Positive values → unchanged",
    sep="\n",
)

# Method 2: the same computation through a Keras Dense layer.
# The layer draws its own He-normal kernel, so its numeric output differs from
# the manual result above; only the operation (relu(X @ W + b)) is equivalent.
print(f"\n🎛️ KERAS DENSE LAYER (equivalent):")

# Create Keras dense layer
dense_layer = layers.Dense(
    units=output_dim,           # Number of output neurons
    activation='relu',          # Activation function
    kernel_initializer='he_normal',  # Weight initialization
    bias_initializer='zeros',   # Bias initialization
    use_bias=True,             # Whether to use bias (default True)
    name='example_dense_layer'
)

# Calling the layer on data builds it: the kernel and bias are created here,
# sized from the last dimension of the input.
keras_output = dense_layer(X_sample)

print(f"Keras Dense layer configuration:")
print(f"  Units (neurons): {dense_layer.units}")
print(f"  Activation: {dense_layer.activation.__name__}")
# `dense_layer.input_shape` is not usable here: a layer called directly on an
# eager tensor is not part of a model graph, so the attribute raises
# AttributeError (and does not exist at all in Keras 3). Report the shape of
# the tensor that was actually fed to the layer instead.
print(f"  Input shape: {X_sample.shape}")
print(f"  Output shape: {keras_output.shape}")
print(f"  Trainable parameters: {dense_layer.count_params()}")

print(f"\nKeras layer weights:")
print(f"  Kernel (weights) shape: {dense_layer.kernel.shape}")
print(f"  Bias shape: {dense_layer.bias.shape}")

# Visualize the dense layer operation as a 2x3 grid of panels.
def _show_matrix(ax, tensor, cmap, heading, xlabel, ylabel, fmt, **text_style):
    """Draw `tensor` as a heatmap on `ax` and print every cell's value on top of it.

    Returns the image object so the caller can attach a colorbar to it.
    """
    image = ax.imshow(tensor.numpy(), cmap=cmap, aspect='auto')
    ax.set_title(f'{heading}\nShape: {tensor.shape}')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # Row-major walk over the cells; x is the column index, y the row index.
    for row, col in np.ndindex(*tensor.shape):
        ax.text(col, row, format(tensor[row, col], fmt),
                ha='center', va='center', **text_style)
    return image


fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(ax_input, ax_weights, ax_linear), (ax_relu, ax_params, ax_activations) = axes

# Plot 1: Input data (white labels for contrast against viridis)
_show_matrix(ax_input, X_sample, 'viridis', 'Input Data',
             'Features', 'Samples', '.1f', color='white')

# Plot 2: Weights, with a colorbar for the diverging colormap
im = _show_matrix(ax_weights, W, 'RdBu', 'Weights',
                  'Output Neurons', 'Input Features', '.2f')
plt.colorbar(im, ax=ax_weights)

# Plot 3: Linear output (before activation)
_show_matrix(ax_linear, linear_plus_bias, 'coolwarm', 'Linear Output (X@W + b)',
             'Output Neurons', 'Samples', '.2f')

# Plot 4: Activated output
_show_matrix(ax_relu, activated_output, 'plasma', 'After ReLU Activation',
             'Output Neurons', 'Samples', '.2f')

# Plot 5: Parameter count breakdown
param_types = ['Weights', 'Biases']
param_counts = [tf.size(W).numpy(), tf.size(b).numpy()]
ax_params.bar(param_types, param_counts, color=['skyblue', 'lightcoral'])
ax_params.set_title('Parameter Count Breakdown')
ax_params.set_ylabel('Number of Parameters')
for i, count in enumerate(param_counts):
    # Place each count just above the top of its bar.
    ax_params.text(i, count + 0.1, str(count), ha='center', va='bottom')

# Plot 6: Activation comparison over the range [-3, 3]
x_vals = np.linspace(-3, 3, 100)
relu_vals = np.maximum(0, x_vals)
sigmoid_vals = 1 / (1 + np.exp(-x_vals))
tanh_vals = np.tanh(x_vals)

activation_curves = (
    (relu_vals, 'b-', 'ReLU'),
    (sigmoid_vals, 'r--', 'Sigmoid'),
    (tanh_vals, 'g:', 'Tanh'),
)
for curve, line_style, curve_label in activation_curves:
    ax_activations.plot(x_vals, curve, line_style, linewidth=2, label=curve_label)
ax_activations.set_title('Activation Functions Comparison')
ax_activations.set_xlabel('Input')
ax_activations.set_ylabel('Output')
ax_activations.legend()
ax_activations.grid(True, alpha=0.3)
# Faint axes through the origin make the sign change of each curve easy to see.
ax_activations.axhline(y=0, color='k', linestyle='-', alpha=0.3)
ax_activations.axvline(x=0, color='k', linestyle='-', alpha=0.3)

plt.tight_layout()
plt.show()

# Recap of the dense-layer walkthrough.
print(
    "\n✨ DENSE LAYER KEY INSIGHTS:",
    "• Each output neuron is a linear combination of ALL input features",
    "• Weights determine the strength and direction of connections",
    "• Bias allows the neuron to activate even when all inputs are zero",
    "• Activation function introduces non-linearity (critical for learning complex patterns)",
    "• Parameter count grows as input_size × output_size (can get large quickly!)",
    sep="\n",
)

## 2. Multi-Layer Perceptrons (MLPs): Stacking for Power

**Why Multiple Layers?**
- Single layer = linear classifier (limited to linear decision boundaries)
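- Hidden layers + non-linear activations = universal approximator (with enough hidden units, an MLP can approximate any continuous function on a bounded input range to arbitrary accuracy)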
- Each layer learns increasingly complex features

**Architecture Design Principles:**
1. **Input Layer**: Matches your data dimensions
2. **Hidden Layers**: 
   - Start wide, gradually narrow (funnel shape)
   - Or use consistent width
   - More layers = more complexity, but harder to train
3. **Output Layer**: Matches your target
   - Regression: 1 neuron, no activation (or linear)
   - Binary classification: 1 neuron, sigmoid activation
   - Multi-class: n_classes neurons, softmax activation

**Common Architectures:**
- **Small datasets**: 2-3 hidden layers, 10-100 neurons each
- **Large datasets**: 3-5 hidden layers, 100-1000 neurons each
- **Rule of thumb**: Start simple, add complexity if needed

In [None]:
# Cell 3: Multi-Layer Perceptron Architecture Design

from sklearn.datasets import make_moons

print("=== MULTI-LAYER PERCEPTRON DESIGN ===")

# Two interleaving half-moons: a problem no linear boundary can separate.
print(
    "\n🎯 CREATING COMPLEX DATASET:",
    "Problem: Non-linear classification (moons dataset)",
    sep="\n",
)

X, y = make_moons(n_samples=1000, noise=0.1, random_state=RANDOM_SEED)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

print(
    "Dataset characteristics:",
    f"  Total samples: {X.shape[0]}",
    f"  Features: {X.shape[1]} (x, y coordinates)",
    f"  Classes: {np.unique(y).size} (binary classification)",
    f"  Training set: {X_train.shape[0]} samples",
    f"  Test set: {X_test.shape[0]} samples",
    f"  Class distribution: {np.bincount(y)}",
    sep="\n",
)

# Standardize features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(
    "\n📏 FEATURE SCALING:",
    f"Before scaling - mean: {X_train.mean(axis=0)}, std: {X_train.std(axis=0)}",
    f"After scaling  - mean: {X_train_scaled.mean(axis=0)}, std: {X_train_scaled.std(axis=0)}",
    "Why scaling? Neural networks are sensitive to input scales!",
    sep="\n",
)

# Design different MLP architectures
def create_mlp_architectures():
    """Return the catalogue of MLP layouts compared in this notebook.

    Returns:
        Dict keyed by architecture name, in the order shallow, medium, deep,
        wide, deep_narrow. Each value is a dict with:
          'description' - short human-readable label,
          'layers'      - list of hidden-layer widths, first hidden layer first,
          'reasoning'   - why the layout is worth comparing.
    """
    # (name, description, hidden-layer widths, reasoning)
    catalogue = (
        ('shallow', 'Single hidden layer', [10],
         'Simple baseline - may struggle with complex patterns'),
        ('medium', 'Two hidden layers', [20, 10],
         'More capacity - can learn more complex decision boundaries'),
        ('deep', 'Three hidden layers', [32, 16, 8],
         'Deep network - can learn hierarchical features'),
        ('wide', 'Wide single layer', [50],
         'High capacity in single layer - good for tabular data'),
        ('deep_narrow', 'Deep but narrow', [8, 8, 8, 8],
         'Many layers, few neurons - may have vanishing gradients'),
    )

    return {
        key: {
            'description': description,
            'layers': hidden_sizes,
            'reasoning': reasoning,
        }
        for key, description, hidden_sizes, reasoning in catalogue
    }

architectures = create_mlp_architectures()

print("\n🏗️ MLP ARCHITECTURE COMPARISON:")
for name, config in architectures.items():
    hidden_sizes = config['layers']

    # Layer widths from the 2 input features through to the single output neuron.
    widths = [2, *hidden_sizes, 1]
    # Every dense layer holds fan_in * fan_out weights plus fan_out biases.
    total_params = sum(
        fan_in * fan_out + fan_out
        for fan_in, fan_out in zip(widths[:-1], widths[1:])
    )

    print(
        f"\n{name.upper()}:",
        f"  Description: {config['description']}",
        f"  Hidden layers: {hidden_sizes}",
        f"  Reasoning: {config['reasoning']}",
        f"  Total parameters: {total_params}",
        sep="\n",
    )

# Build and compare models
def build_mlp(hidden_layers, input_dim=2, output_dim=1, name="mlp",
              output_activation='sigmoid'):
    """Build an (uncompiled) Sequential MLP with ReLU hidden layers.

    Args:
        hidden_layers: Iterable of hidden-layer widths, first hidden layer first.
        input_dim: Number of input features.
        output_dim: Number of output neurons.
        name: Name given to the Keras model.
        output_activation: Activation of the output layer. Defaults to
            'sigmoid' (binary classification with one output neuron). Pass
            'softmax' for multi-class outputs or None/'linear' for regression,
            so that `output_dim` values other than 1 get a matching activation.

    Returns:
        A `Sequential` model: Input -> Dense(relu, he_normal) per entry of
        `hidden_layers` (named hidden_1, hidden_2, ...) -> Dense output layer
        named 'output'.
    """
    model = models.Sequential(name=name)

    # Explicit Input layer: fixes the feature dimension so the model is built
    # (weights created) as soon as the Dense layers are added.
    model.add(layers.Input(shape=(input_dim,)))

    # Hidden layers: He-normal initialization is paired with the ReLU activation.
    for i, units in enumerate(hidden_layers, start=1):
        model.add(layers.Dense(
            units=units,
            activation='relu',
            kernel_initializer='he_normal',
            name=f'hidden_{i}'
        ))

    # Output layer
    model.add(layers.Dense(
        units=output_dim,
        activation=output_activation,
        name='output'
    ))

    return model

# Build one model per catalogued architecture, keyed by architecture name.
models_dict = {
    arch_name: build_mlp(arch['layers'], name=arch_name)
    for arch_name, arch in architectures.items()
}

# Every model gets the same training setup, each with its own Adam instance.
for mlp in models_dict.values():
    mlp.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

print(
    "\n🔧 MODEL COMPILATION:",
    "Optimizer: Adam (learning_rate=0.001)",
    "Loss: Binary crossentropy (for binary classification)",
    "Metrics: Accuracy",
    sep="\n",
)

# Layer-by-layer summary of one representative model.
print("\n📋 DETAILED ARCHITECTURE (MEDIUM MODEL):")
models_dict['medium'].summary()

# Visualize the dataset and a schematic of four of the architectures (2x3 grid).
plt.figure(figsize=(15, 10))

# Plot 1: Original dataset, one colour per class
plt.subplot(2, 3, 1)
colors = ['red', 'blue']
for i in range(2):
    mask = y == i
    plt.scatter(X[mask, 0], X[mask, 1], c=colors[i], alpha=0.7, label=f'Class {i}', s=30)

plt.title('Original Dataset (Moons)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True, alpha=0.3)

# Plot 2: Scaled dataset (training split only)
plt.subplot(2, 3, 2)
for i in range(2):
    mask = y_train == i
    plt.scatter(X_train_scaled[mask, 0], X_train_scaled[mask, 1],
               c=colors[i], alpha=0.7, label=f'Class {i}', s=30)

plt.title('Scaled Training Data')
plt.xlabel('Scaled Feature 1')
plt.ylabel('Scaled Feature 2')
plt.legend()
plt.grid(True, alpha=0.3)

# Plot 3-6: Architecture visualizations (grid slots 3..6)
selected_models = ['shallow', 'medium', 'deep', 'wide']
for i, model_name in enumerate(selected_models):
    plt.subplot(2, 3, i + 3)

    model = models_dict[model_name]
    arch = architectures[model_name]

    # Widths of every layer: input + hidden + output.
    # Deliberately NOT named `layers`: rebinding that name would replace the
    # imported Keras `layers` module with a list and break every later
    # `layers.Dense(...)` / `layers.Input(...)` call in the notebook.
    layer_sizes = [2] + arch['layers'] + [1]
    last_index = len(layer_sizes) - 1

    # One node per layer, stacked vertically; the layer index is the y position.
    for j, layer_size in enumerate(layer_sizes):
        if j == 0:
            color = 'lightgreen'
            label = f'Input\n({layer_size})'
        elif j == last_index:
            color = 'lightcoral'
            label = f'Output\n({layer_size})'
        else:
            color = 'lightblue'
            label = f'Hidden {j}\n({layer_size})'

        circle = plt.Circle((0, j), 0.3, color=color, alpha=0.7)
        plt.gca().add_patch(circle)
        plt.text(0, j, label, ha='center', va='center', fontsize=8)

        # Connector from this layer to the next one (none after the output layer)
        if j < last_index:
            plt.plot([0.3, -0.3], [j, j + 1], 'k-', alpha=0.5)

    plt.xlim(-1, 1)
    plt.ylim(-0.5, len(layer_sizes) - 0.5)
    plt.title(f'{model_name.capitalize()} Architecture\n({model.count_params()} params)')
    plt.axis('off')

plt.tight_layout()
plt.show()

# Summary of the design guidance illustrated by the comparison above.
print("\n💡 ARCHITECTURE DESIGN PRINCIPLES:")

print(
    "\n1. DEPTH vs WIDTH:",
    "   • More layers (depth): Can learn hierarchical features",
    "   • More neurons (width): More capacity per layer",
    "   • Trade-off: Training difficulty vs. expressiveness",
    sep="\n",
)

print(
    "\n2. PARAMETER COUNT:",
    "   • More parameters: Higher capacity but risk of overfitting",
    "   • Fewer parameters: Less overfitting but may underfit",
    "   • Rule: Start simple, add complexity as needed",
    sep="\n",
)

print(
    "\n3. ACTIVATION FUNCTIONS:",
    "   • Hidden layers: ReLU (most common, works well)",
    "   • Output layer: Depends on problem type",
    "     - Regression: Linear or no activation",
    "     - Binary classification: Sigmoid",
    "     - Multi-class: Softmax",
    sep="\n",
)

print(
    "\n4. INITIALIZATION:",
    "   • He initialization: Good for ReLU networks",
    "   • Xavier: Good for sigmoid/tanh networks",
    "   • Proper initialization prevents vanishing/exploding gradients",
    sep="\n",
)

## 3. Training Process: Understanding What Happens During Learning

**The Training Loop Explained:**
1. **Forward Pass**: Data flows through network, predictions computed
2. **Loss Calculation**: Compare predictions to true labels
3. **Backward Pass**: Compute gradients via backpropagation
4. **Weight Update**: Optimizer adjusts weights using gradients
5. **Repeat**: Until convergence or max epochs

**Key Training Concepts:**
- **Epoch**: One complete pass through entire training dataset
- **Batch**: Subset of data processed together (for efficiency)
- **Learning Rate**: How big steps to take during optimization
- **Validation**: Monitor performance on unseen data

**Monitoring Training:**
- **Loss curves**: Should generally decrease over time
- **Accuracy curves**: Should generally increase over time
- **Overfitting signs**: Training accuracy ≫ validation accuracy

In [None]:
# Cell 4: Training Process Deep Dive

print("=== TRAINING PROCESS ANALYSIS ===")

# Per-model results collected by the training loop, keyed by architecture name.
training_histories, training_times = {}, {}

print(
    "\n🚂 TRAINING ALL MODELS:",
    "Training configuration:",
    "  Epochs: 100",
    "  Batch size: 32",
    "  Validation split: 20% of training data",
    "  Early stopping: Patience=10 (stop if no improvement)",
    sep="\n",
)

# Both callbacks watch the validation loss.
early_stopping_options = {
    'monitor': 'val_loss',
    'patience': 10,                 # epochs without improvement before stopping
    'restore_best_weights': True,   # roll back to the best epoch's weights
    'verbose': 0,
}
early_stopping = callbacks.EarlyStopping(**early_stopping_options)

reduce_lr_options = {
    'monitor': 'val_loss',
    'factor': 0.5,      # halve the learning rate ...
    'patience': 5,      # ... after 5 epochs on a plateau
    'min_lr': 1e-6,     # lower bound for the learning rate
    'verbose': 0,
}
reduce_lr = callbacks.ReduceLROnPlateau(**reduce_lr_options)

print(
    "\n📋 CALLBACKS CONFIGURED:",
    "  Early Stopping: Stops training if validation loss doesn't improve for 10 epochs",
    "  Learning Rate Reduction: Halves learning rate if val_loss plateaus for 5 epochs",
    sep="\n",
)

# Train every model with the same settings, recording history and wall-clock time.
import time

for name, model in models_dict.items():
    print(f"\nTraining {name.upper()} model...")

    started_at = time.time()
    history = model.fit(
        X_train_scaled,
        y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.2,   # the last 20% of the training data is held out
        callbacks=[early_stopping, reduce_lr],
        verbose=0,
    )
    training_time = time.time() - started_at

    metrics_log = history.history
    training_histories[name] = metrics_log
    training_times[name] = training_time

    # Evaluate on the held-out test set
    test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)

    # Early stopping may end training before epoch 100, so count the logged epochs;
    # the "best" values are taken over the whole run, not from the last epoch.
    epochs_trained = len(metrics_log['loss'])
    best_val_loss = min(metrics_log['val_loss'])
    best_val_acc = max(metrics_log['val_accuracy'])

    print(
        f"  Epochs trained: {epochs_trained}",
        f"  Training time: {training_time:.2f} seconds",
        f"  Best val_loss: {best_val_loss:.4f}",
        f"  Best val_accuracy: {best_val_acc:.4f}",
        f"  Test accuracy: {test_accuracy:.4f}",
        sep="\n",
    )

# Analyze training curves
print("\n📊 TRAINING ANALYSIS:")

# 3x2 dashboard: four learning-curve panels plus two model-comparison panels.
fig, axes = plt.subplots(3, 2, figsize=(16, 18))

# Plots 1-4: (axis, history key, title, y label, whether the metric is a loss)
curve_panels = (
    (axes[0, 0], 'loss', 'Training Loss Curves', 'Loss', True),
    (axes[0, 1], 'val_loss', 'Validation Loss Curves', 'Validation Loss', True),
    (axes[1, 0], 'accuracy', 'Training Accuracy Curves', 'Training Accuracy', False),
    (axes[1, 1], 'val_accuracy', 'Validation Accuracy Curves', 'Validation Accuracy', False),
)
for ax, metric, panel_title, y_label, is_loss in curve_panels:
    ax.set_title(panel_title)
    for name, history in training_histories.items():
        # Only the training-loss panel carries the parameter count in its legend.
        if metric == 'loss':
            curve_label = f'{name} (params: {models_dict[name].count_params()})'
        else:
            curve_label = name
        ax.plot(history[metric], label=curve_label, linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True, alpha=0.3)
    if is_loss:
        ax.set_yscale('log')    # losses span orders of magnitude
    else:
        ax.set_ylim(0, 1)       # accuracies share the full [0, 1] range

# Plot 5: Model comparison metrics, all lists aligned with `model_names`
model_names = list(models_dict)
test_accuracies = [
    models_dict[name].evaluate(X_test_scaled, y_test, verbose=0)[1]   # [loss, accuracy]
    for name in model_names
]
param_counts = [models_dict[name].count_params() for name in model_names]
train_times = [training_times[name] for name in model_names]

ax_accuracy = axes[2, 0]
x_pos = np.arange(len(model_names))
bars = ax_accuracy.bar(x_pos, test_accuracies, color='skyblue', alpha=0.7)
ax_accuracy.set_title('Test Accuracy Comparison')
ax_accuracy.set_xlabel('Model Architecture')
ax_accuracy.set_ylabel('Test Accuracy')
ax_accuracy.set_xticks(x_pos)
ax_accuracy.set_xticklabels(model_names, rotation=45)
ax_accuracy.grid(True, alpha=0.3)
ax_accuracy.set_ylim(0, 1)

# Write each accuracy just above its bar
for bar, acc in zip(bars, test_accuracies):
    bar_centre = bar.get_x() + bar.get_width()/2
    ax_accuracy.text(bar_centre, bar.get_height() + 0.01,
                     f'{acc:.3f}', ha='center', va='bottom')

# Plot 6: Parameters vs Performance, coloured by training time
ax_tradeoff = axes[2, 1]
ax_tradeoff.scatter(param_counts, test_accuracies, s=100, c=train_times,
                    cmap='viridis', alpha=0.7)
ax_tradeoff.set_xlabel('Number of Parameters')
ax_tradeoff.set_ylabel('Test Accuracy')
ax_tradeoff.set_title('Parameters vs Performance\n(Color = Training Time)')
ax_tradeoff.grid(True, alpha=0.3)

# Label every point with its model name
for i, name in enumerate(model_names):
    ax_tradeoff.annotate(name, (param_counts[i], test_accuracies[i]),
                         xytext=(5, 5), textcoords='offset points', fontsize=9)

# Colorbar for training time, spanning the fastest to the slowest model
time_norm = plt.Normalize(vmin=min(train_times), vmax=max(train_times))
sm = plt.cm.ScalarMappable(cmap='viridis', norm=time_norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax_tradeoff)
cbar.set_label('Training Time (seconds)')

plt.tight_layout()
plt.show()

# Detailed analysis
print("\n🔍 DETAILED TRAINING ANALYSIS:")

# Best model = highest test accuracy (first one wins on ties).
best_model_idx = np.argmax(test_accuracies)
best_model_name = model_names[best_model_idx]
best_accuracy = test_accuracies[best_model_idx]

print(
    f"\n🏆 BEST PERFORMING MODEL: {best_model_name.upper()}",
    f"  Test accuracy: {best_accuracy:.4f}",
    f"  Parameters: {param_counts[best_model_idx]:,}",
    f"  Training time: {train_times[best_model_idx]:.2f} seconds",
    f"  Architecture: {architectures[best_model_name]['layers']}",
    sep="\n",
)

# Efficiency = test accuracy per 1000 parameters.
params_in_thousands = np.array(param_counts) / 1000
efficiency_scores = np.array(test_accuracies) / params_in_thousands
most_efficient_idx = np.argmax(efficiency_scores)
most_efficient_name = model_names[most_efficient_idx]

print(
    f"\n⚡ MOST EFFICIENT MODEL: {most_efficient_name.upper()}",
    f"  Efficiency score: {efficiency_scores[most_efficient_idx]:.4f} (accuracy per 1000 params)",
    f"  Test accuracy: {test_accuracies[most_efficient_idx]:.4f}",
    f"  Parameters: {param_counts[most_efficient_idx]:,}",
    sep="\n",
)

# Overfitting analysis: compare last-epoch train and validation accuracy per model.
print("\n🎯 OVERFITTING ANALYSIS:")
for name, test_acc in zip(model_names, test_accuracies):
    history = training_histories[name]
    final_train_acc = history['accuracy'][-1]
    final_val_acc = history['val_accuracy'][-1]

    overfitting_gap = final_train_acc - final_val_acc
    # Computed for reference; not included in the printed report.
    generalization_gap = final_val_acc - test_acc

    # Traffic-light rating of the train/validation gap.
    if overfitting_gap < 0.05:
        status = "🟢 Good"
    elif overfitting_gap < 0.1:
        status = "🟡 Moderate"
    else:
        status = "🔴 High"

    print(f"  {name:12s}: Train={final_train_acc:.3f}, Val={final_val_acc:.3f}, Test={test_acc:.3f} | "
          f"Overfitting gap={overfitting_gap:.3f} {status}")

# Takeaways from the training comparison.
print("\n💡 TRAINING INSIGHTS:")

print(
    "\n1. CONVERGENCE PATTERNS:",
    "   • Shallow models: Fast convergence, may underfit complex patterns",
    "   • Deep models: Slower convergence, higher capacity for complex patterns",
    "   • Wide models: Good balance of speed and performance",
    sep="\n",
)

print(
    "\n2. PARAMETER EFFICIENCY:",
    "   • More parameters ≠ always better performance",
    "   • Sweet spot depends on data complexity and size",
    "   • Regularization helps prevent overfitting with more parameters",
    sep="\n",
)

print(
    "\n3. TRAINING TIME FACTORS:",
    "   • Model depth: More layers = longer training",
    "   • Model width: More neurons = more computations",
    "   • Dataset size: More samples = longer epochs",
    "   • Batch size: Larger batches = fewer updates per epoch",
    sep="\n",
)

## 4. Regularization: Preventing Overfitting

**What is Overfitting?**
- Model memorizes training data instead of learning general patterns
- High training accuracy, low validation/test accuracy
- Model becomes too complex for the amount of training data

**Common Regularization Techniques:**

1. **Dropout**: Randomly "turn off" neurons during training
   - Prevents co-adaptation of neurons
   - Typical rates: 0.2-0.5 (20%-50% of neurons dropped)

2. **L1/L2 Regularization**: Add penalty for large weights
   - L1: Promotes sparsity (many weights become zero)
   - L2: Promotes small weights (weight decay)

3. **Batch Normalization**: Normalize inputs to each layer
   - Stabilizes training, acts as regularizer
   - Allows higher learning rates

4. **Early Stopping**: Stop training when validation loss stops improving
   - Simple but effective
   - Prevents overfitting to training data

5. **Data Augmentation**: Artificially increase dataset size
   - Add noise, rotate, scale images
   - Forces model to be robust to variations

In [None]:
# Cell 5: Regularization Techniques Comprehensive Analysis

print("=== REGULARIZATION TECHNIQUES ===")

# Build a dataset that invites overfitting: few samples, many uninformative features.
print("\n🎯 CREATING OVERFITTING-PRONE DATASET:")

overfit_data_options = {
    'n_samples': 500,             # small dataset
    'n_features': 20,             # many features ...
    'n_informative': 5,           # ... of which only 5 carry signal
    'n_redundant': 5,             # and 5 are combinations of the informative ones
    'n_clusters_per_class': 1,
    'class_sep': 0.8,             # moderate class separation
    'random_state': RANDOM_SEED,
}
X_over, y_over = make_classification(**overfit_data_options)

X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(
    X_over, y_over, test_size=0.3, random_state=RANDOM_SEED
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler_over = StandardScaler().fit(X_train_over)
X_train_scaled_over = scaler_over.transform(X_train_over)
X_test_scaled_over = scaler_over.transform(X_test_over)

print(
    "Dataset characteristics (overfitting scenario):",
    f"  Training samples: {X_train_over.shape[0]}",
    f"  Test samples: {X_test_over.shape[0]}",
    f"  Features: {X_over.shape[1]}",
    "  Informative features: 5 (others are noise/redundant)",
    "  Challenge: Small dataset + many features = overfitting risk",
    sep="\n",
)

# Define regularization techniques
def create_regularized_models(input_dim=20):
    """Build one binary classifier per regularization strategy.

    Every variant uses the same 64-32-16 ReLU stack with a single sigmoid
    output unit, so differences in training behavior come from the
    regularization alone.

    Args:
        input_dim: Number of input features. Defaults to 20, the feature
            count of the overfitting dataset built earlier in this cell.

    Returns:
        dict: Maps a short key ('no_reg', 'dropout', 'l2_reg', 'l1_reg',
        'batch_norm', 'combined') to an uncompiled Sequential model.
    """
    # This dict must NOT be named `models`: that would shadow the imported
    # keras `models` module and make `models.Sequential` fail.
    regularized = {}
    input_shape = (input_dim,)

    # 1. No regularization (baseline - will overfit)
    regularized['no_reg'] = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ], name='no_regularization')

    # 2. Dropout regularization
    regularized['dropout'] = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),  # Drop 30% of neurons
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.2),  # Less dropout in final layers
        layers.Dense(1, activation='sigmoid')
    ], name='dropout_regularization')

    # 3. L2 (weight decay) regularization
    regularized['l2_reg'] = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.01)),
        layers.Dense(32, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.01)),
        layers.Dense(16, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.01)),
        layers.Dense(1, activation='sigmoid')
    ], name='l2_regularization')

    # 4. L1 regularization (sparsity)
    regularized['l1_reg'] = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu',
                     kernel_regularizer=keras.regularizers.l1(0.01)),
        layers.Dense(32, activation='relu',
                     kernel_regularizer=keras.regularizers.l1(0.01)),
        layers.Dense(16, activation='relu',
                     kernel_regularizer=keras.regularizers.l1(0.01)),
        layers.Dense(1, activation='sigmoid')
    ], name='l1_regularization')

    # 5. Batch normalization
    regularized['batch_norm'] = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),  # Normalize layer inputs
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(16, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1, activation='sigmoid')
    ], name='batch_normalization')

    # 6. Combined regularization (the "kitchen sink" approach)
    regularized['combined'] = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.Dense(16, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.001)),
        layers.Dropout(0.1),
        layers.Dense(1, activation='sigmoid')
    ], name='combined_regularization')

    return regularized

# Timing below needs the stdlib `time` module; import it here so this cell
# does not depend on an import made in some other cell.
import time

reg_models = create_regularized_models()

print(f"\n🛡️ REGULARIZATION TECHNIQUES OVERVIEW:")
regularization_explanations = {
    'no_reg': 'No regularization - baseline model (expected to overfit)',
    'dropout': 'Dropout layers (0.2-0.3 rate) - prevents neuron co-adaptation',
    'l2_reg': 'L2 weight penalty (0.01) - encourages small weights',
    'l1_reg': 'L1 weight penalty (0.01) - encourages sparse weights',
    'batch_norm': 'Batch normalization - stabilizes training, implicit regularization',
    'combined': 'Multiple techniques - L2 + BatchNorm + Dropout'
}

# One summary line plus the parameter count for every variant.
for name, description in regularization_explanations.items():
    model = reg_models[name]
    print(f"  {name:12s}: {description}")
    print(f"                 Parameters: {model.count_params():,}")

# Compile all models with identical settings so only regularization differs.
# A fresh Adam instance is created per model.
for name, model in reg_models.items():
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

# Train all models and track overfitting
print(f"\n🚂 TRAINING REGULARIZED MODELS:")
reg_histories = {}    # name -> Keras history dict (per-epoch metrics)
reg_train_times = {}  # name -> wall-clock training duration in seconds

# Use early stopping for all models
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=0
)

for name, model in reg_models.items():
    print(f"Training {name}...")

    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()

    history = model.fit(
        X_train_scaled_over, y_train_over,
        epochs=200,
        batch_size=16,  # Smaller batch for this small dataset
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=0
    )

    end_time = time.perf_counter()

    reg_histories[name] = history.history
    reg_train_times[name] = end_time - start_time

    # Evaluate overfitting.
    # NOTE: train/val accuracies are taken from the LAST epoch run, whereas
    # the test score uses whatever weights the model holds after fit()
    # (early stopping is configured with restore_best_weights=True).
    final_train_acc = history.history['accuracy'][-1]
    final_val_acc = history.history['val_accuracy'][-1]
    test_loss, test_acc = model.evaluate(X_test_scaled_over, y_test_over, verbose=0)

    overfitting_gap = final_train_acc - final_val_acc
    epochs_trained = len(history.history['loss'])

    print(f"  Epochs: {epochs_trained:3d} | Train: {final_train_acc:.3f} | "
          f"Val: {final_val_acc:.3f} | Test: {test_acc:.3f} | Gap: {overfitting_gap:.3f}")

# Comprehensive visualization of regularization effects
fig, axes = plt.subplots(3, 3, figsize=(20, 15))


def _annotate_bars(ax, bars, values, fmt, offset):
    """Write each value just beyond the tip of its bar.

    Positive bars are labelled above the tip, negative bars below it, so the
    text never ends up inside the bar.
    """
    for bar, value in zip(bars, values):
        height = bar.get_height()
        below = height < 0
        ax.text(bar.get_x() + bar.get_width() / 2,
                height - offset if below else height + offset,
                format(value, fmt), ha='center',
                va='top' if below else 'bottom', fontsize=8)


# Training curves for each model (top two rows = six panels)
for i, (name, history) in enumerate(reg_histories.items()):
    row, col = divmod(i, 3)
    if row >= 2:  # bottom row is reserved for the comparison charts
        continue

    ax = axes[row, col]
    epochs = range(1, len(history['loss']) + 1)
    final_gap = history['accuracy'][-1] - history['val_accuracy'][-1]

    # Plot training and validation curves
    ax.plot(epochs, history['accuracy'], 'b-', linewidth=2, label='Training Accuracy')
    ax.plot(epochs, history['val_accuracy'], 'r-', linewidth=2, label='Validation Accuracy')

    ax.set_title(f'{name.replace("_", " ").title()}\n'
                 f'Final Gap: {final_gap:.3f}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1)

    # Highlight overfitting region
    if final_gap > 0.1:
        ax.axhspan(history['val_accuracy'][-1], history['accuracy'][-1],
                   alpha=0.2, color='red', label='Overfitting Gap')

    # Build the legend last so the optional 'Overfitting Gap' band is listed.
    ax.legend()

# Comparison plots in bottom row
# Test accuracy comparison
model_names = list(reg_models.keys())
test_accuracies = []
overfitting_gaps = []

for name in model_names:
    model = reg_models[name]
    history = reg_histories[name]

    test_loss, test_acc = model.evaluate(X_test_scaled_over, y_test_over, verbose=0)
    test_accuracies.append(test_acc)

    # Gap = last-epoch training accuracy minus last-epoch validation accuracy.
    gap = history['accuracy'][-1] - history['val_accuracy'][-1]
    overfitting_gaps.append(gap)

x_pos = np.arange(len(model_names))
tick_labels = [name.replace('_', '\n') for name in model_names]

# Plot 7: Test accuracy comparison (red marks models with a gap above 0.1)
bars = axes[2, 0].bar(x_pos, test_accuracies,
                      color=['red' if gap > 0.1 else 'green' for gap in overfitting_gaps],
                      alpha=0.7)
axes[2, 0].set_title('Test Accuracy by Regularization Method')
axes[2, 0].set_xlabel('Regularization Method')
axes[2, 0].set_ylabel('Test Accuracy')
axes[2, 0].set_xticks(x_pos)
axes[2, 0].set_xticklabels(tick_labels, fontsize=8)
axes[2, 0].grid(True, alpha=0.3)
axes[2, 0].set_ylim(0, 1)

# Add accuracy values on bars
_annotate_bars(axes[2, 0], bars, test_accuracies, '.3f', 0.01)

# Plot 8: Overfitting gap comparison
bars2 = axes[2, 1].bar(x_pos, overfitting_gaps,
                       color=['red' if gap > 0.1 else 'orange' if gap > 0.05 else 'green'
                              for gap in overfitting_gaps],
                       alpha=0.7)
axes[2, 1].set_title('Overfitting Gap (Train - Val Accuracy)')
axes[2, 1].set_xlabel('Regularization Method')
axes[2, 1].set_ylabel('Overfitting Gap')
axes[2, 1].set_xticks(x_pos)
axes[2, 1].set_xticklabels(tick_labels, fontsize=8)
axes[2, 1].axhline(y=0.05, color='orange', linestyle='--', alpha=0.7, label='Moderate (0.05)')
axes[2, 1].axhline(y=0.1, color='red', linestyle='--', alpha=0.7, label='High (0.10)')
axes[2, 1].legend()
axes[2, 1].grid(True, alpha=0.3)

# Add gap values on bars (gaps can be negative, hence the sign-aware helper)
_annotate_bars(axes[2, 1], bars2, overfitting_gaps, '.3f', 0.005)

# Plot 9: Training efficiency (epochs to convergence)
epochs_to_converge = [len(reg_histories[name]['loss']) for name in model_names]
bars3 = axes[2, 2].bar(x_pos, epochs_to_converge, color='skyblue', alpha=0.7)
axes[2, 2].set_title('Training Efficiency (Epochs to Convergence)')
axes[2, 2].set_xlabel('Regularization Method')
axes[2, 2].set_ylabel('Epochs')
axes[2, 2].set_xticks(x_pos)
axes[2, 2].set_xticklabels(tick_labels, fontsize=8)
axes[2, 2].grid(True, alpha=0.3)

# Add epoch values on bars
_annotate_bars(axes[2, 2], bars3, epochs_to_converge, '', 1)

plt.tight_layout()
plt.show()

# Detailed analysis and recommendations
print("\n🔍 REGULARIZATION ANALYSIS:")

# Per-model lookups keyed by the short model name.
gap_by_model = dict(zip(model_names, overfitting_gaps))
acc_by_model = dict(zip(model_names, test_accuracies))

# Winner: the variant with the highest test accuracy.
best_reg_idx = np.argmax(test_accuracies)
best_reg_name = model_names[best_reg_idx]
best_reg_acc = test_accuracies[best_reg_idx]
best_reg_gap = overfitting_gaps[best_reg_idx]

print("\n".join([
    f"\n🏆 BEST REGULARIZATION METHOD: {best_reg_name.upper()}",
    f"  Test accuracy: {best_reg_acc:.4f}",
    f"  Overfitting gap: {best_reg_gap:.4f}",
    f"  Epochs to converge: {epochs_to_converge[best_reg_idx]}",
]))

# Loser: the variant with the largest train-minus-validation gap.
worst_overfitter_idx = np.argmax(overfitting_gaps)
worst_overfitter_name = model_names[worst_overfitter_idx]

print("\n".join([
    f"\n🔴 WORST OVERFITTER: {worst_overfitter_name.upper()}",
    f"  Overfitting gap: {overfitting_gaps[worst_overfitter_idx]:.4f}",
    f"  Test accuracy: {test_accuracies[worst_overfitter_idx]:.4f}",
]))

print("\n💡 REGULARIZATION INSIGHTS:")

# Insight 1: dropout compared against the unregularized baseline.
dropout_gap = gap_by_model['dropout']
no_reg_gap = gap_by_model['no_reg']
print("\n".join([
    "\n1. DROPOUT EFFECTIVENESS:",
    f"   • Reduced overfitting gap by {no_reg_gap - dropout_gap:.3f}",
    "   • Works by preventing neuron co-adaptation",
    "   • Only active during training, disabled during inference",
]))

# Insight 2: the two weight-penalty variants.
l2_gap = gap_by_model['l2_reg']
l1_gap = gap_by_model['l1_reg']
print("\n".join([
    "\n2. WEIGHT REGULARIZATION (L1/L2):",
    f"   • L2 regularization gap: {l2_gap:.3f} (encourages small weights)",
    f"   • L1 regularization gap: {l1_gap:.3f} (encourages sparse weights)",
    "   • Both add penalty to loss function for large weights",
]))

# Insight 3: batch normalization.
bn_gap = gap_by_model['batch_norm']
print("\n".join([
    "\n3. BATCH NORMALIZATION:",
    f"   • Overfitting gap: {bn_gap:.3f}",
    "   • Primary purpose: Stabilize training (regularization is side effect)",
    "   • Normalizes inputs to each layer",
]))

# Insight 4: everything combined.
combined_gap = gap_by_model['combined']
combined_acc = acc_by_model['combined']
print("\n".join([
    "\n4. COMBINED APPROACH:",
    f"   • Overfitting gap: {combined_gap:.3f}",
    f"   • Test accuracy: {combined_acc:.3f}",
    "   • Multiple regularization techniques can work together",
    "   • But may slow training and require hyperparameter tuning",
]))

# Static guidance, emitted as one report.
print("\n".join([
    "\n✨ PRACTICAL RECOMMENDATIONS:",
    "\n1. START SIMPLE:",
    "   • Begin with dropout (0.2-0.5 rate)",
    "   • Add early stopping (always recommended)",
    "   • Monitor train vs validation curves",
    "\n2. ADD COMPLEXITY GRADUALLY:",
    "   • If still overfitting, add L2 regularization (0.001-0.01)",
    "   • Consider batch normalization for deep networks",
    "   • L1 regularization for feature selection (sparse weights)",
    "\n3. HYPERPARAMETER TUNING:",
    "   • Dropout rate: Higher for more regularization",
    "   • L2 lambda: Start small (0.001), increase if needed",
    "   • Always validate on held-out data",
    "\n4. SIGNS OF GOOD REGULARIZATION:",
    "   • Training and validation curves stay close together",
    "   • Validation accuracy doesn't start declining",
    "   • Test accuracy similar to validation accuracy",
]))

## 5. Real-World Application: Complete Project Walkthrough

**Project: Wine Quality Classification**
- **Domain**: Food & Beverage Industry
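- **Problem**: Predict wine quality from chemical properties (note: the scikit-learn `load_wine` dataset used below labels each sample with one of three cultivars, which this walkthrough treats as the "quality" classes)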
- **Type**: Multi-class classification
- **Goal**: Build production-ready model with proper evaluation

**End-to-End Pipeline:**
1. **Data Loading & Exploration**: Understand the dataset
2. **Feature Engineering**: Improve input representations
3. **Model Design**: Architecture selection and justification
4. **Training Strategy**: Proper validation and hyperparameter tuning
5. **Evaluation**: Comprehensive performance analysis
6. **Production Considerations**: Model deployment readiness

**This section demonstrates industry best practices for neural network projects.**

In [None]:
# Cell 6: Complete Real-World Project - Wine Quality Classification

print("=== REAL-WORLD PROJECT: WINE QUALITY CLASSIFICATION ===")

# Step 1: Data Loading and Exploration
print("\n📊 STEP 1: DATA LOADING AND EXPLORATION")

# Pull the arrays and their human-readable names out of the dataset bundle.
wine_data = load_wine()
X_wine, y_wine = wine_data.data, wine_data.target
feature_names = wine_data.feature_names
target_names = wine_data.target_names

n_samples, n_features = X_wine.shape
print("\n".join([
    "Dataset Overview:",
    f"  Samples: {n_samples}",
    f"  Features: {n_features}",
    f"  Classes: {len(target_names)} ({', '.join(target_names)})",
    "  Problem type: Multi-class classification",
]))

# A DataFrame makes per-column statistics easy; the label goes last.
wine_df = pd.DataFrame(X_wine, columns=feature_names)
wine_df['quality'] = y_wine

# Value range of the first five features only, to keep the output short.
print("\nFeature Information:")
for position, feature in enumerate(feature_names[:5], start=1):
    column = wine_df[feature]
    print(f"  {position:2d}. {feature:20s}: {column.min():.2f} - {column.max():.2f}")
print(f"  ... and {len(feature_names)-5} more features")

# Samples per class, with each class's share of the whole dataset.
class_counts = np.bincount(y_wine)
print("\nClass Distribution:")
for name, count in zip(target_names, class_counts):
    percentage = count / len(y_wine) * 100
    print(f"  {name:15s}: {count:3d} samples ({percentage:5.1f}%)")

# Ratio of the largest class to the smallest; above 3 is flagged.
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio <= 3:
    print("✅ Class distribution is reasonably balanced.")
else:
    print("⚠️  Significant class imbalance detected! Consider techniques like class weighting.")

# Step 2: Feature Engineering and Analysis
print(f"\n🔧 STEP 2: FEATURE ENGINEERING")

# Correlation analysis
correlation_matrix = wine_df.corr()

# Redundancy is a feature-vs-feature property, so the 'quality' target column
# is excluded before scanning. Otherwise a feature that merely correlates
# with the label would be reported as a redundant "feature pair".
feature_corr = correlation_matrix.drop(index='quality', columns='quality').abs()

# Visit each unordered pair once (upper triangle, diagonal excluded).
high_corr_pairs = [
    (feat1, feat2, feature_corr.loc[feat1, feat2])
    for idx, feat1 in enumerate(feature_corr.columns)
    for feat2 in feature_corr.columns[idx + 1:]
    if 0.8 < feature_corr.loc[feat1, feat2] < 1.0  # High correlation but not perfect
]

print(f"High correlation analysis:")
if high_corr_pairs:
    print(f"  Found {len(high_corr_pairs)} highly correlated feature pairs (|r| > 0.8):")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"    {feat1} <-> {feat2}: {corr:.3f}")
    print(f"  💡 Consider feature selection or dimensionality reduction")
else:
    print(f"  ✅ No highly correlated features found")

# Feature scaling analysis (every column except the trailing 'quality' label)
feature_scales = wine_df.iloc[:, :-1].std()
scale_ratio = feature_scales.max() / feature_scales.min()
print(f"\nFeature scaling analysis:")
print(f"  Largest std: {feature_scales.max():.2f} ({feature_scales.idxmax()})")
print(f"  Smallest std: {feature_scales.min():.2f} ({feature_scales.idxmin()})")
print(f"  Scale ratio: {scale_ratio:.1f}:1")
if scale_ratio > 10:
    print(f"  ⚠️  Large scale differences - normalization strongly recommended")
else:
    print(f"  ℹ️  Moderate scale differences - normalization recommended")

# Split data (stratified so both splits keep the class proportions)
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    X_wine, y_wine, test_size=0.2, random_state=RANDOM_SEED, stratify=y_wine
)

# Scale features: fit on the training split only, then apply to both splits
scaler_wine = StandardScaler()
X_train_wine_scaled = scaler_wine.fit_transform(X_train_wine)
X_test_wine_scaled = scaler_wine.transform(X_test_wine)

print(f"\nData preparation:")
print(f"  Training samples: {len(X_train_wine)} (80%)")
print(f"  Test samples: {len(X_test_wine)} (20%)")
print(f"  Features scaled: Mean ≈ 0, Std ≈ 1")

# Step 3: Model Architecture Design
print(f"\n🏗️ STEP 3: MODEL ARCHITECTURE DESIGN")

def create_wine_model(architecture_type='optimal', input_dim=13, num_classes=3):
    """Create one of three dense architectures for wine classification.

    Args:
        architecture_type: 'simple' (one hidden layer), 'optimal'
            (64-32-16 with batch norm and dropout) or 'complex'
            (128-64-32-16 with batch norm and dropout).
        input_dim: Number of input features. Defaults to 13, the feature
            count of the wine dataset.
        num_classes: Number of softmax output units. Defaults to 3, the
            number of wine classes.

    Returns:
        An uncompiled Sequential model named '<architecture_type>_wine_classifier'.

    Raises:
        ValueError: If architecture_type is not one of the supported names.
    """
    input_shape = (input_dim,)

    if architecture_type == 'simple':
        # Simple baseline
        model = models.Sequential([
            layers.Input(shape=input_shape),
            layers.Dense(16, activation='relu'),
            layers.Dense(num_classes, activation='softmax')
        ], name='simple_wine_classifier')

    elif architecture_type == 'optimal':
        # Well-designed architecture for this problem size
        model = models.Sequential([
            layers.Input(shape=input_shape),
            layers.Dense(64, activation='relu', kernel_initializer='he_normal'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),

            layers.Dense(32, activation='relu', kernel_initializer='he_normal'),
            layers.BatchNormalization(),
            layers.Dropout(0.2),

            layers.Dense(16, activation='relu', kernel_initializer='he_normal'),
            layers.Dropout(0.1),

            layers.Dense(num_classes, activation='softmax')
        ], name='optimal_wine_classifier')

    elif architecture_type == 'complex':
        # More complex (possibly overkill for this dataset)
        model = models.Sequential([
            layers.Input(shape=input_shape),
            layers.Dense(128, activation='relu', kernel_initializer='he_normal'),
            layers.BatchNormalization(),
            layers.Dropout(0.4),

            layers.Dense(64, activation='relu', kernel_initializer='he_normal'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),

            layers.Dense(32, activation='relu', kernel_initializer='he_normal'),
            layers.BatchNormalization(),
            layers.Dropout(0.2),

            layers.Dense(16, activation='relu', kernel_initializer='he_normal'),
            layers.Dropout(0.1),

            layers.Dense(num_classes, activation='softmax')
        ], name='complex_wine_classifier')

    else:
        # Without this branch an unknown name would reach `return model`
        # with `model` unbound and fail with a confusing UnboundLocalError.
        raise ValueError(
            f"Unknown architecture_type {architecture_type!r}; "
            "expected 'simple', 'optimal' or 'complex'"
        )

    return model

# Build one model per architecture so their sizes can be compared.
wine_models = {
    kind: create_wine_model(kind) for kind in ('simple', 'optimal', 'complex')
}

print("Architecture comparison:")
for name, model in wine_models.items():
    n_params = model.count_params()
    # Bucket by parameter count: under 1000 is Low, under 5000 is Medium.
    if n_params < 1000:
        complexity = 'Low'
    elif n_params < 5000:
        complexity = 'Medium'
    else:
        complexity = 'High'

    print(f"\n{name.upper()} MODEL:")
    print(f"  Layers: {len(model.layers)}")
    print(f"  Parameters: {n_params:,}")
    print(f"  Complexity: {complexity}")

# Facts about the problem that the justification below refers to.
dataset_size = len(X_train_wine)
features_count = X_wine.shape[1]
classes_count = np.unique(y_wine).size

print("\n".join([
    "\nArchitecture Design Justification:",
    f"  Dataset size: {dataset_size} samples",
    f"  Feature count: {features_count}",
    f"  Classes: {classes_count}",
    "  Recommendation: OPTIMAL architecture",
    "  Reasoning:",
    "    • Medium dataset size → moderate complexity needed",
    "    • 13 features → not too high-dimensional",
    "    • 3 classes → straightforward multi-class problem",
    "    • Regularization included to prevent overfitting",
]))

# Step 4: Training Strategy
print(f"\n🚂 STEP 4: TRAINING STRATEGY")

# Compile models with different optimizers for comparison
optimizers_to_test = {
    'adam_001': optimizers.Adam(learning_rate=0.001),
    'adam_01': optimizers.Adam(learning_rate=0.01),
    'sgd_momentum': optimizers.SGD(learning_rate=0.01, momentum=0.9),
}

# Use the optimal model for hyperparameter comparison.
# The winning configuration is chosen on VALIDATION accuracy. Choosing it on
# test accuracy would leak the test set into model selection and make the
# reported test score optimistic.
best_model = None
best_accuracy = 0                  # test accuracy of the selected model
best_config = None
best_val_accuracy = float('-inf')  # selection criterion

print(f"\nHyperparameter optimization:")
for opt_name, optimizer in optimizers_to_test.items():
    print(f"\nTesting optimizer: {opt_name}")

    # Create fresh model so no weights carry over between optimizers
    model = create_wine_model('optimal')
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',  # labels are integer class ids
        metrics=['accuracy']
    )

    # Training callbacks (new instances for every run)
    early_stop = callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=20,
        restore_best_weights=True
    )

    reduce_lr = callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=10,
        min_lr=1e-6
    )

    # Train model
    history = model.fit(
        X_train_wine_scaled, y_train_wine,
        epochs=200,
        batch_size=16,
        validation_split=0.2,
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )

    # Evaluate: the test score is reported for every run but is NOT used
    # to pick the winner.
    val_accuracy = max(history.history['val_accuracy'])
    test_loss, test_accuracy = model.evaluate(X_test_wine_scaled, y_test_wine, verbose=0)
    epochs_trained = len(history.history['loss'])

    print(f"  Best val accuracy: {val_accuracy:.4f}")
    print(f"  Test accuracy: {test_accuracy:.4f}")
    print(f"  Epochs trained: {epochs_trained}")

    # Track best model by validation accuracy (ties keep the earlier config)
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_accuracy = test_accuracy
        best_model = model
        best_config = opt_name

print(f"\n🏆 BEST CONFIGURATION: {best_config}")
print(f"  Validation accuracy (selection criterion): {best_val_accuracy:.4f}")
print(f"  Test accuracy: {best_accuracy:.4f}")

# Step 5: Comprehensive Evaluation
print(f"\n📊 STEP 5: COMPREHENSIVE EVALUATION")

# Predictions and metrics: the class with the highest probability per row
y_pred_wine = best_model.predict(X_test_wine_scaled)
y_pred_wine_classes = np.argmax(y_pred_wine, axis=1)

# Confusion matrix
cm_wine = confusion_matrix(y_test_wine, y_pred_wine_classes)
classification_rep = classification_report(y_test_wine, y_pred_wine_classes,
                                           target_names=target_names,
                                           output_dict=True)

print(f"\nClassification Results:")
print(f"  Overall Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.1f}%)")
print(f"  Macro Average F1: {classification_rep['macro avg']['f1-score']:.4f}")
print(f"  Weighted Average F1: {classification_rep['weighted avg']['f1-score']:.4f}")

# Because target_names is passed to classification_report, the per-class
# entries of the report dict are keyed by class NAME, not by the stringified
# label index. Indexing with str(i) raises KeyError.
print(f"\nPer-Class Performance:")
for class_name in target_names:
    class_report = classification_rep[class_name]
    precision = class_report['precision']
    recall = class_report['recall']
    f1 = class_report['f1-score']
    support = class_report['support']

    print(f"  {class_name:15s}: P={precision:.3f}, R={recall:.3f}, F1={f1:.3f} (n={support:2.0f})")

# Visualization of results
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Confusion Matrix
sns.heatmap(cm_wine, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names, ax=axes[0, 0])
axes[0, 0].set_title('Confusion Matrix')
axes[0, 0].set_ylabel('True Label')
axes[0, 0].set_xlabel('Predicted Label')

# Plot 2: Feature Importance (approximate using model weights)
# Get weights from first layer.
# NOTE(review): assumes best_model.layers[0] is the first Dense layer, i.e.
# that the Input layer is not listed in `.layers` - confirm for the installed
# Keras version.
first_layer_weights = best_model.layers[0].get_weights()[0]
# Mean absolute outgoing weight per input feature (one value per kernel row)
feature_importance = np.abs(first_layer_weights).mean(axis=1)
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=True)

axes[0, 1].barh(range(len(feature_importance_df)), feature_importance_df['importance'])
axes[0, 1].set_yticks(range(len(feature_importance_df)))
axes[0, 1].set_yticklabels(feature_importance_df['feature'], fontsize=8)
axes[0, 1].set_title('Feature Importance (Approximate)')
axes[0, 1].set_xlabel('Average Absolute Weight')

# Plot 3: Prediction Confidence Distribution
max_probs = np.max(y_pred_wine, axis=1)
axes[1, 0].hist(max_probs, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
axes[1, 0].set_title('Prediction Confidence Distribution')
axes[1, 0].set_xlabel('Maximum Predicted Probability')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].axvline(x=0.8, color='red', linestyle='--', label='High Confidence Threshold')
axes[1, 0].legend()

# Plot 4: Per-Class Metrics (rows = classes, columns = metrics)
metrics = ['precision', 'recall', 'f1-score']
class_metrics = np.array([[classification_rep[class_name][metric] for metric in metrics]
                          for class_name in target_names])

x = np.arange(len(target_names))
width = 0.25

# Three side-by-side bars per class, one for each metric
for i, metric in enumerate(metrics):
    axes[1, 1].bar(x + i*width, class_metrics[:, i], width,
                   label=metric.capitalize(), alpha=0.8)

axes[1, 1].set_title('Per-Class Performance Metrics')
axes[1, 1].set_xlabel('Wine Class')
axes[1, 1].set_ylabel('Score')
axes[1, 1].set_xticks(x + width)  # centre the tick under the middle bar
axes[1, 1].set_xticklabels(target_names)
axes[1, 1].legend()
axes[1, 1].set_ylim(0, 1)

plt.tight_layout()
plt.show()

# Step 6: Production Readiness Assessment
print("\n🚀 STEP 6: PRODUCTION READINESS ASSESSMENT")

# How many predictions have a top-class probability above the threshold?
high_confidence_threshold = 0.8
n_predictions = len(max_probs)
high_confidence_predictions = (max_probs > high_confidence_threshold).sum()
high_conf_percentage = high_confidence_predictions / n_predictions * 100

print("\n".join([
    "\nModel Confidence Analysis:",
    f"  High confidence predictions (>{high_confidence_threshold}): {high_confidence_predictions}/{n_predictions} ({high_conf_percentage:.1f}%)",
    f"  Average confidence: {max_probs.mean():.3f}",
    f"  Minimum confidence: {max_probs.min():.3f}",
]))

# Accuracy restricted to predictions above each confidence cut-off,
# together with the share of test samples that clear the cut-off.
confidence_levels = [0.5, 0.7, 0.8, 0.9]
print("\nAccuracy at different confidence thresholds:")
for threshold in confidence_levels:
    mask = max_probs > threshold
    if not mask.any():
        print(f"  Confidence > {threshold}: No predictions above threshold")
        continue
    high_conf_accuracy = (y_pred_wine_classes[mask] == y_test_wine[mask]).mean()
    coverage = mask.mean() * 100
    print(f"  Confidence > {threshold}: {high_conf_accuracy:.3f} accuracy, {coverage:.1f}% coverage")

# Size estimate: 4 bytes per parameter (float32), converted to MiB.
param_count = best_model.count_params()
model_size_mb = param_count * 4 / (1024 * 1024)
print("\n".join([
    "\nModel Deployment Characteristics:",
    f"  Parameters: {param_count:,}",
    f"  Estimated size: {model_size_mb:.2f} MB (float32)",
    f"  Input features: {X_wine.shape[1]}",
    f"  Output classes: {len(target_names)}",
    "  Preprocessing required: Feature scaling (StandardScaler)",
]))

# Business-facing summary, use cases and operational caveats.
print("\n".join([
    "\n💼 BUSINESS IMPACT ANALYSIS:",
    "\nModel Performance Summary:",
    f"  ✅ Overall accuracy: {best_accuracy*100:.1f}% (industry benchmark varies)",
    "  ✅ Balanced performance across all wine classes",
    f"  ✅ High confidence predictions: {high_conf_percentage:.1f}%",
    "  ✅ Lightweight model suitable for production deployment",
    "\nPotential Use Cases:",
    "  🍷 Quality control in wine production",
    "  📊 Batch classification for inventory management",
    "  🎯 Real-time quality assessment in production line",
    "  📈 Quality prediction for new wine batches",
    "\n⚠️  Production Considerations:",
    "  • Model retraining: Recommended quarterly with new data",
    "  • Monitoring: Track prediction confidence and drift in input features",
    "  • Validation: Regular comparison with expert wine tasters",
    "  • Fallback: Manual inspection for low-confidence predictions",
]))

# Closing checklist for the walkthrough.
print("\n".join([
    "\n🎉 PROJECT COMPLETION SUMMARY:",
    "✅ Successfully built end-to-end wine classification system",
    f"✅ Achieved {best_accuracy*100:.1f}% accuracy on test set",
    "✅ Implemented proper regularization and validation",
    "✅ Conducted comprehensive evaluation and error analysis",
    "✅ Assessed production readiness and business impact",
    "\n🎓 You've completed a professional-grade machine learning project!",
]))