# Character-Level Bi-LSTM Shakespeare Text Generation: Test Suite

**Purpose**: Comprehensive testing and evaluation notebook supporting both:
1. Training models from scratch
2. Testing pre-trained models

**Model Architecture**:
- Embedding: 128 dim (70 char vocab)
- Bi-LSTM 1: 256 units with return_sequences=True
- Bi-LSTM 2: 128 units with return_sequences=False
- Dense: 256 units + ReLU activation
- Output: 70 units (character vocab) + Softmax

**Key Metrics**:
- Test Accuracy: 49.90% (target: 70%)
- Perplexity: 5.69 (excellent, <10)
- Top-5 Accuracy: 80.40%

In [None]:
"""Import required libraries and set configuration"""

# Core libraries
import os
import sys
import numpy as np
import pandas as pd
import pickle
import time
import re
from collections import Counter
from pathlib import Path

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, Bidirectional, Embedding
)
from tensorflow.keras.callbacks import (
    EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseTopKCategoricalAccuracy

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec
import seaborn as sns

# Configure matplotlib
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Print library versions
print(f"TensorFlow version: {tf.__version__}")
print(f"Python version: {sys.version}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Configuration
CONFIG = {
    'sequence_length': 100,
    'batch_size': 512,
    'embedding_dim': 128,
    'lstm_units': 256,
    'lstm_units_2': 128,
    'dense_units': 256,
    'dropout_rate': 0.2,
    'epochs': 30,
    'validation_split': 0.15,
    'learning_rate': 0.002,
}

print(f"\n‚úì Configuration loaded: {CONFIG}")

In [None]:
"""Helper functions for character-level data preprocessing"""

def load_shakespeare_text(file_path='Shakespeare.txt'):
    """Read the corpus file and return its entire contents as one string.

    Args:
        file_path: Location of the text file; it is decoded as UTF-8.

    Returns:
        The full decoded text of the file.
    """
    return Path(file_path).read_text(encoding='utf-8')

def create_char_mappings(text):
    """Build the lookup tables between characters and integer ids.

    Ids are assigned by the sorted order of the distinct characters
    found in ``text``, starting at 0.

    Args:
        text: String whose distinct characters form the vocabulary.

    Returns:
        Tuple ``(char_to_idx, idx_to_char)`` of mutually inverse dicts.
    """
    vocabulary = sorted(set(text))
    idx_to_char = dict(enumerate(vocabulary))
    char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}
    return char_to_idx, idx_to_char

def create_training_sequences(text, char_to_idx, seq_length=100, step=3):
    """Create (input sequence, target character) pairs for training.

    A window of ``seq_length`` characters slides over ``text`` in increments
    of ``step``; the character immediately after each window is its target.
    A window is skipped as a whole when its input or its target contains a
    character missing from ``char_to_idx``, so ``X`` and ``y`` always have
    the same number of rows.

    Args:
        text: Raw text string
        char_to_idx: Character to index mapping
        seq_length: Length of input sequences
        step: Step size for sliding window (3 = faster training)

    Returns:
        X: Input sequences, integer array of shape [N, seq_length]
        y: Target characters, integer array of shape [N]
        When no window qualifies, N is 0 and X still has shape
        [0, seq_length].
    """
    # Encode the text once instead of re-encoding every overlapping window;
    # None marks characters that are absent from the mapping.
    encoded = [char_to_idx.get(c) for c in text]

    X = []
    y = []

    for i in range(0, len(text) - seq_length, step):
        # Input window plus its target, checked together so that a pair is
        # either kept or dropped atomically (an unknown target must not
        # leave an orphan input row behind).
        window = encoded[i:i + seq_length + 1]
        if None in window:
            continue
        X.append(window[:-1])
        y.append(window[-1])

    if not X:
        # Keep a well-formed 2-D input array even when there is no data.
        return np.empty((0, seq_length), dtype=int), np.empty((0,), dtype=int)

    return np.array(X, dtype=int), np.array(y, dtype=int)

def load_or_create_char_mappings(model_mappings_path='models/char_mappings.pkl',
                                  text_file='Shakespeare.txt'):
    """Return character mappings, preferring a previously pickled copy.

    Args:
        model_mappings_path: Pickle file holding a dict with the keys
            'char_to_idx' and 'idx_to_char'.
        text_file: Corpus used to derive fresh mappings when the pickle
            file does not exist.

    Returns:
        Tuple ``(char_to_idx, idx_to_char)``.
    """
    # No saved mappings on disk: derive them from the raw corpus instead.
    if not os.path.exists(model_mappings_path):
        corpus = load_shakespeare_text(text_file)
        fresh_c2i, fresh_i2c = create_char_mappings(corpus)
        return fresh_c2i, fresh_i2c

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(model_mappings_path, 'rb') as handle:
        saved = pickle.load(handle)
    return saved['char_to_idx'], saved['idx_to_char']

def save_char_mappings(char_to_idx, idx_to_char, save_path='models/char_mappings.pkl'):
    """Save character mappings for later use.

    The mappings are pickled as a dict with the keys 'char_to_idx' and
    'idx_to_char'. The directory that contains ``save_path`` is created
    if it does not exist yet.

    Args:
        char_to_idx: Character to index mapping
        idx_to_char: Index to character mapping
        save_path: Destination pickle file
    """
    # Create the directory the file actually lives in (not a hard-coded
    # 'models' folder), so custom save paths work as well.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, 'wb') as f:
        pickle.dump({
            'char_to_idx': char_to_idx,
            'idx_to_char': idx_to_char
        }, f)
    print(f"‚úì Mappings saved to {save_path}")

print("‚úì Character-level preprocessing functions loaded")

## Section 1: Data Loading and Preparation

This section handles loading Shakespeare text, creating character mappings, 
and preparing training sequences.

In [None]:
"""Load Shakespeare text and create character mappings"""

print("="*70)
print("LOADING DATA")
print("="*70)

# Load text
text = load_shakespeare_text('Shakespeare.txt')
print(f"\n‚úì Loaded {len(text):,} characters from Shakespeare.txt")
print(f"  Sample: {text[:100]}")

# Create character mappings
char_to_idx, idx_to_char = create_char_mappings(text)
vocab_size = len(char_to_idx)

print(f"\n‚úì Character mappings created")
print(f"  Vocabulary size: {vocab_size}")
print(f"  Sample characters: {sorted(list(set(text)))[:30]}")
print(f"  Unique characters: {list(char_to_idx.items())[:10]}")

In [None]:
"""Create training sequences with specified configuration"""

print("\n" + "="*70)
print("CREATING TRAINING SEQUENCES")
print("="*70)

# Create sequences with step=3 for faster creation
X, y = create_training_sequences(
    text=text,
    char_to_idx=char_to_idx,
    seq_length=CONFIG['sequence_length'],
    step=3
)

print(f"\n‚úì Training sequences created")
print(f"  Total sequences: {len(X):,}")
print(f"  Input shape: {X.shape}")
print(f"  Output shape: {y.shape}")

# Train/validation split
split_idx = int(len(X) * (1 - CONFIG['validation_split']))
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]

print(f"\n‚úì Data split completed")
print(f"  Training set: {len(X_train):,} sequences ({100*(1-CONFIG['validation_split']):.1f}%)")
print(f"  Validation set: {len(X_val):,} sequences ({100*CONFIG['validation_split']:.1f}%)")

# Store for later use
DATA_SHAPES = {
    'X_train': X_train.shape,
    'X_val': X_val.shape,
    'vocab_size': vocab_size,
    'sequence_length': CONFIG['sequence_length']
}

## Section 2: Model Training

Two pathways:
1. **Train from scratch**: Full character-level Bi-LSTM training
2. **Load pre-trained**: Use existing models/char_bilstm_best.h5

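Choose based on your requirements and available compute resources. If both options are enabled, the pre-trained model is loaded second and replaces the freshly trained one as `TRAINED_MODEL`.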

In [None]:
"""Helper function to build character-level Bi-LSTM model"""

def build_char_bilstm_model(vocab_size, sequence_length=100):
    """Assemble the stacked character-level Bi-LSTM network.

    Layer widths and the LSTM dropout rates are read from the module-level
    CONFIG dict. The model is returned without being compiled; compile it
    separately before training or evaluation.

    Args:
        vocab_size: Size of character vocabulary (embedding input size and
            width of the softmax output)
        sequence_length: Length of input sequences

    Returns:
        Keras Sequential model named 'CharLevel_BiLSTM' (not compiled)
    """
    lstm_drop = CONFIG['dropout_rate']

    def _bi_lstm(units, return_sequences, lstm_name, wrapper_name):
        """Wrap a single LSTM layer in a Bidirectional layer."""
        recurrent = LSTM(
            units,
            return_sequences=return_sequences,
            dropout=lstm_drop,
            recurrent_dropout=lstm_drop,
            name=lstm_name,
        )
        return Bidirectional(recurrent, name=wrapper_name)

    layer_stack = [
        # Character ids -> dense vectors
        Embedding(
            vocab_size,
            CONFIG['embedding_dim'],
            input_length=sequence_length,
            name='embedding',
        ),
        # First recurrent block keeps the full sequence for the next block
        _bi_lstm(CONFIG['lstm_units'], True, 'lstm_1', 'bidirectional_1'),
        # Second recurrent block collapses the sequence to one vector
        _bi_lstm(CONFIG['lstm_units_2'], False, 'lstm_2', 'bidirectional_2'),
        Dropout(0.3, name='dropout_lstm'),
        Dense(CONFIG['dense_units'], activation='relu', name='dense_hidden'),
        Dropout(0.3, name='dropout_final'),
        # Probability distribution over the character vocabulary
        Dense(vocab_size, activation='softmax', name='output'),
    ]

    return Sequential(layer_stack, name='CharLevel_BiLSTM')

# Compile helper
def compile_char_model(model, learning_rate=None):
    """Attach optimizer, loss and metrics to ``model`` and return it.

    Args:
        model: Keras model to compile (compiled in place)
        learning_rate: Adam learning rate; when None the value of
            CONFIG['learning_rate'] is used

    Returns:
        The same model object, now compiled with sparse categorical
        cross-entropy loss and the metrics 'accuracy' and 'top5_acc'.
    """
    lr = CONFIG['learning_rate'] if learning_rate is None else learning_rate

    optimizer = Adam(learning_rate=lr)
    tracked_metrics = [
        'accuracy',
        SparseTopKCategoricalAccuracy(k=5, name='top5_acc'),
    ]

    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=tracked_metrics,
    )
    return model

print("‚úì Model building functions ready")

In [None]:
"""OPTION 1: Train character-level Bi-LSTM from scratch"""

TRAIN_FROM_SCRATCH = False  # <-- SET TO TRUE TO TRAIN

if TRAIN_FROM_SCRATCH:
    print("="*70)
    print("TRAINING FROM SCRATCH")
    print("="*70)
    
    # Build fresh model
    print("\nüèóÔ∏è Building fresh Bi-LSTM model...")
    model = build_char_bilstm_model(vocab_size, CONFIG['sequence_length'])
    model = compile_char_model(model)
    
    print("\nüìã Model Summary:")
    model.summary()
    
    # Setup callbacks
    os.makedirs('models', exist_ok=True)
    os.makedirs('logs/char_level', exist_ok=True)
    
    callbacks = [
        EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True,
            verbose=1,
            mode='max'
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=2,
            min_lr=1e-6,
            verbose=1
        ),
        ModelCheckpoint(
            'models/char_bilstm_best_fresh.h5',
            monitor='val_accuracy',
            save_best_only=True,
            verbose=1,
            mode='max'
        )
    ]
    
    # Train
    print("\nüöÄ Starting training...")
    print("-"*70)
    
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=CONFIG['batch_size'],
        epochs=CONFIG['epochs'],
        callbacks=callbacks,
        verbose=1
    )
    
    # Save character mappings
    save_char_mappings(char_to_idx, idx_to_char, 
                      'models/char_mappings_fresh.pkl')
    
    print("\n‚úÖ Training completed!")
    TRAINED_MODEL = model
    TRAINED_FROM_SCRATCH = True
else:
    print("‚è≠Ô∏è  TRAIN_FROM_SCRATCH = False - skipping training")
    print("   Set TRAIN_FROM_SCRATCH = True in cell 9 to enable training")
    TRAINED_FROM_SCRATCH = False

In [None]:
"""OPTION 2: Load pre-trained character-level model"""

LOAD_PRETRAINED = True  # <-- SET TO FALSE TO SKIP

# Start from "not loaded"; only a successful load below flips the flag.
PRETRAINED_LOADED = False

if LOAD_PRETRAINED:
    print("="*70)
    print("LOADING PRE-TRAINED MODEL")
    print("="*70)
    
    model_path = 'models/char_bilstm_best.h5'
    
    if os.path.exists(model_path):
        print(f"\nüì¶ Loading model from {model_path}...")
        model = load_model(model_path)
        print("‚úì Model loaded successfully")
        
        # Verify model structure
        print("\nüìã Loaded Model Summary:")
        model.summary()
        
        # NOTE: this replaces any model produced by the training cell.
        TRAINED_MODEL = model
        PRETRAINED_LOADED = True
    else:
        print(f"‚úó Model not found at {model_path}")
        print("  Please train from scratch or verify model path")
else:
    print("‚è≠Ô∏è  LOAD_PRETRAINED = False - skipping pretrained model loading")

# Ensure we have a model. The training cell may not have been run at all,
# so a missing TRAINED_FROM_SCRATCH flag is treated as False instead of
# raising NameError. An explicit exception is used rather than `assert`,
# which is skipped when Python runs with -O.
if not (globals().get('TRAINED_FROM_SCRATCH', False) or PRETRAINED_LOADED):
    raise RuntimeError(
        "No model available! Set TRAIN_FROM_SCRATCH or LOAD_PRETRAINED to True"
    )

print("\n‚úì Model ready for testing!")

## Section 3: Quantitative Evaluation

Metrics evaluated:
- **Accuracy**: Top-1 character prediction accuracy
- **Top-5 Accuracy**: Correct character in top-5 predictions
- **Loss**: Sparse categorical crossentropy
- **Perplexity**: exp(loss), i.e. the effective number of equally likely characters the model is choosing among at each step (lower is better)

In [None]:
"""Prepare test set for quantitative evaluation"""

print("="*70)
print("PREPARING TEST SET")
print("="*70)

# Use last 15% of text as test set.
# NOTE(review): this is the same region of the text that the validation
# split is taken from (X[split_idx:] with validation_split = 0.15). When
# the model is trained in this notebook, that validation data drives
# EarlyStopping and ModelCheckpoint, so these characters are not used for
# weight updates but they do influence model selection. Treat the scores
# below as validation-like rather than as a fully held-out test result.
# For a pre-trained model the training data is not visible from here.
test_start = int(len(text) * 0.85)
test_text = text[test_start:]

print(f"\n‚úì Test set prepared")
print(f"  Test set size: {len(test_text):,} characters")
print(f"  Sample: {test_text[:100]}")

# Create test sequences
X_test, y_test = create_training_sequences(
    text=test_text,
    char_to_idx=char_to_idx,
    seq_length=CONFIG['sequence_length'],
    step=10  # Larger step for test efficiency
)

print(f"\n‚úì Test sequences created")
print(f"  Total test sequences: {len(X_test):,}")
print(f"  Test shape: {X_test.shape}")

In [None]:
"""Quantitative evaluation on test set"""

print("\n" + "="*70)
print("QUANTITATIVE EVALUATION")
print("="*70)

# Evaluate
print("\nüìä Evaluating model on test set...")
start_time = time.time()

results = TRAINED_MODEL.evaluate(
    X_test, y_test,
    batch_size=CONFIG['batch_size'],
    verbose=0
)

eval_time = time.time() - start_time

# Parse results
test_loss = results[0]
test_acc = results[1]
test_top5 = results[2]

print(f"\n‚úì Evaluation completed in {eval_time:.2f}s")

# Display metrics
print(f"\n{'‚îÄ'*70}")
print(f"{'METRIC':<30} {'VALUE':>15} {'TARGET':>15}")
print(f"{'‚îÄ'*70}")
print(f"{'Accuracy':<30} {test_acc*100:>14.2f}% {'>70%':>15}")
print(f"{'Top-5 Accuracy':<30} {test_top5*100:>14.2f}% {'>80%':>15}")
print(f"{'Loss':<30} {test_loss:>15.4f} {'<2.0':>15}")
print(f"{'‚îÄ'*70}")

# Calculate perplexity
perplexity = np.exp(test_loss)
print(f"{'Perplexity':<30} {perplexity:>15.2f} {'<10':>15}")
print(f"{'‚îÄ'*70}")

# Store results for later
EVAL_RESULTS = {
    'test_accuracy': test_acc,
    'test_top5_accuracy': test_top5,
    'test_loss': test_loss,
    'perplexity': perplexity,
    'eval_time': eval_time
}

# Assessment
print(f"\nüìà Assessment:")
if test_acc >= 0.70:
    print(f"  üåü EXCELLENT - Exceeds 70% accuracy target!")
elif test_acc >= 0.60:
    print(f"  ‚úÖ GOOD - Strong performance, {(0.70-test_acc)*100:.1f}% below target")
elif test_acc >= 0.50:
    print(f"  ‚ö†Ô∏è  MODERATE - {(0.70-test_acc)*100:.1f}% below target, more training needed")
else:
    print(f"  ‚ùå NEEDS IMPROVEMENT - {(0.70-test_acc)*100:.1f}% below target")

## Section 4: Inference Speed Benchmarking

Evaluate model inference performance across different batch sizes.

In [None]:
"""Benchmark inference speed across different batch sizes"""

print("\n" + "="*70)
print("INFERENCE SPEED BENCHMARKING")
print("="*70)

benchmark_results = {}

# Test different batch sizes
batch_sizes = [1, 32, 64, 128, 256]

for batch_size in batch_sizes:
    if len(X_test) < batch_size:
        print(f"\n‚è≠Ô∏è  Skipping batch size {batch_size} (insufficient test samples)")
        continue
    
    print(f"\n‚ö° Testing batch size {batch_size}...")
    
    # Slice the batch once so that slicing is not part of the timed work
    batch = X_test[:batch_size]
    
    # Warmup (lets one-time setup costs settle before timing)
    for _ in range(10):
        _ = TRAINED_MODEL.predict(batch, verbose=0)
    
    # Benchmark. perf_counter() is a monotonic, high-resolution clock meant
    # for measuring short durations; time.time() can jump with system clock
    # adjustments and may have coarse resolution.
    times = []
    for _ in range(20):  # Fewer runs for faster testing
        start = time.perf_counter()
        _ = TRAINED_MODEL.predict(batch, verbose=0)
        times.append(time.perf_counter() - start)
    
    mean_time = np.mean(times)
    std_time = np.std(times)
    throughput = batch_size / mean_time
    per_sample = (mean_time * 1000) / batch_size
    
    benchmark_results[batch_size] = {
        'mean_ms': mean_time * 1000,
        'std_ms': std_time * 1000,
        'throughput': throughput,
        'per_sample_ms': per_sample
    }
    
    print(f"  ‚îú‚îÄ Mean time: {mean_time*1000:.2f} ¬± {std_time*1000:.2f} ms")
    print(f"  ‚îú‚îÄ Per sample: {per_sample:.3f} ms")
    print(f"  ‚îî‚îÄ Throughput: {throughput:.1f} samples/sec")

# Find optimal batch size (highest throughput). Every batch size is skipped
# when the test set is empty, in which case there is nothing to rank.
if benchmark_results:
    optimal_batch = max(benchmark_results.items(), 
                       key=lambda x: x[1]['throughput'])[0]

    print(f"\nüéØ Optimal batch size: {optimal_batch}")
    print(f"   Throughput: {benchmark_results[optimal_batch]['throughput']:.1f} samples/sec")
else:
    optimal_batch = None
    print("\nNo batch size could be benchmarked (no test samples available)")

BENCHMARK_RESULTS = benchmark_results

In [None]:
"""Visualize inference speed benchmarking results"""

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Throughput
batch_sizes = sorted(list(BENCHMARK_RESULTS.keys()))
throughputs = [BENCHMARK_RESULTS[bs]['throughput'] for bs in batch_sizes]
latencies = [BENCHMARK_RESULTS[bs]['mean_ms'] for bs in batch_sizes]

axes[0].bar(range(len(batch_sizes)), throughputs, color='steelblue', alpha=0.8)
axes[0].set_xlabel('Batch Size', fontsize=12)
axes[0].set_ylabel('Throughput (samples/sec)', fontsize=12)
axes[0].set_title('Inference Throughput by Batch Size', fontsize=13, fontweight='bold')
axes[0].set_xticks(range(len(batch_sizes)))
axes[0].set_xticklabels(batch_sizes)
axes[0].grid(axis='y', alpha=0.3)

# Add values on bars
for i, (bs, tp) in enumerate(zip(batch_sizes, throughputs)):
    axes[0].text(i, tp + 2, f'{tp:.0f}', ha='center', fontsize=10)

# Plot 2: Latency
axes[1].plot(batch_sizes, latencies, marker='o', markersize=8, 
            linewidth=2, color='coral', label='Latency')
axes[1].set_xlabel('Batch Size', fontsize=12)
axes[1].set_ylabel('Mean Latency (ms)', fontsize=12)
axes[1].set_title('Inference Latency by Batch Size', fontsize=13, fontweight='bold')
axes[1].grid(True, alpha=0.3)
axes[1].legend()

plt.tight_layout()
plt.savefig('benchmark_results.png', dpi=100, bbox_inches='tight')
plt.show()

print("‚úì Benchmark visualization saved to benchmark_results.png")

## Section 5: Text Generation Quality

Evaluate quality of generated text using:
1. **Temperature sampling**: Controls randomness (0.5 = conservative, 1.0 = standard)
2. **Multiple prompts**: Test with different Shakespeare phrases
3. **Qualitative analysis**: Word diversity, grammar, coherence
4. **Comparison with original**: Statistical comparison with real Shakespeare

In [None]:
"""Helper function for text generation with temperature sampling"""

def generate_text(model, seed_text, length=200, temperature=1.0,
                 char_to_idx=None, idx_to_char=None, seq_length=100):
    """Generate text using character-level model with temperature sampling.

    At every step the last ``seq_length`` characters generated so far are
    fed to ``model.predict`` and the next character is sampled from the
    temperature-adjusted output distribution. The input is not padded, so
    while fewer than ``seq_length`` characters exist the model receives a
    shorter sequence.

    Args:
        model: Trained Keras model (any object with a compatible
            ``predict(x, verbose=0)`` method works)
        seed_text: Starting text prompt
        length: Number of characters to generate
        temperature: Sampling temperature (higher = more random); must be
            greater than zero
        char_to_idx: Character to index mapping; characters missing from
            it are encoded as index 0
        idx_to_char: Index to character mapping
        seq_length: Sequence length used in model

    Returns:
        Generated text string (the seed followed by ``length`` characters)

    Raises:
        ValueError: If a mapping is missing, ``temperature`` is not
            positive, or ``seed_text`` is empty while characters are
            requested.
    """
    if char_to_idx is None or idx_to_char is None:
        raise ValueError("char_to_idx and idx_to_char mappings are required")
    if temperature <= 0:
        raise ValueError("temperature must be greater than zero")
    if length > 0 and not seed_text:
        raise ValueError("seed_text must contain at least one character")

    # Encode the seed once and extend the index list as characters are
    # produced, instead of re-encoding the whole window on every step.
    indices = [char_to_idx.get(c, 0) for c in seed_text]
    new_chars = []

    for _ in range(length):
        # Model input: the most recent seq_length character ids
        x = np.array([indices[-seq_length:]])

        # Predict next character probabilities; float64 keeps the
        # renormalized distribution summing to 1 as tightly as possible.
        probs = np.asarray(model.predict(x, verbose=0)[0], dtype=np.float64)

        # Apply temperature in log space. Subtracting the maximum before
        # exponentiating leaves the softmax unchanged but avoids overflow
        # at low temperatures.
        logits = np.log(probs + 1e-10) / temperature
        logits -= np.max(logits)
        weights = np.exp(logits)
        probs = weights / np.sum(weights)

        # Sample next character
        next_idx = np.random.choice(len(probs), p=probs)
        next_char = idx_to_char[next_idx]

        new_chars.append(next_char)
        # Re-encode through char_to_idx so the next input matches what
        # encoding the generated string itself would produce.
        indices.append(char_to_idx.get(next_char, 0))

    return seed_text + ''.join(new_chars)

print("‚úì Text generation function ready")

In [None]:
"""Generate text samples with different temperatures"""

print("="*70)
print("TEXT GENERATION QUALITY")
print("="*70)

# Test prompts from Shakespeare
test_prompts = [
    "To be or not to be",
    "O Romeo, Romeo",
    "Friends, Romans, countrymen",
    "All the world's a stage",
    "Now is the winter"
]

# Temperatures to test
temperatures = [0.5, 0.8, 1.0, 1.2]

generated_samples = {}

for temp in temperatures:
    print(f"\n{'‚îÄ'*70}")
    print(f"Temperature: {temp} (conservative={temp<1.0}, standard={temp==1.0})")
    print(f"{'‚îÄ'*70}")
    
    generated_samples[temp] = {}
    
    for i, prompt in enumerate(test_prompts):
        # Generate text
        generated = generate_text(
            model=TRAINED_MODEL,
            seed_text=prompt,
            length=150,
            temperature=temp,
            char_to_idx=char_to_idx,
            idx_to_char=idx_to_char,
            seq_length=CONFIG['sequence_length']
        )
        
        generated_samples[temp][prompt] = generated
        
        print(f"\nüìç Prompt {i+1}: '{prompt}'")
        print(f"Generated:")
        print(generated[:200])  # Show first 200 chars
        print()

print("‚úì Text generation samples created")

In [None]:
"""Helper functions for analyzing generated text quality"""

def analyze_text(text):
    """Compute simple surface statistics for a piece of text.

    Words are runs of word characters in the lower-cased text; all other
    counts are taken on the text exactly as given.

    Returns a dictionary with:
    - word_count: number of words
    - unique_words: number of distinct words
    - vocabulary_diversity: unique_words / word_count (0 without words)
    - char_distribution: Counter of every character in the text
    - sentences: total occurrences of '.', '!' and '?'
    - avg_word_length: mean word length in characters (0 without words)
    - lines: number of newline characters
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    n_tokens = len(tokens)
    n_distinct = len(set(tokens))

    if n_tokens > 0:
        diversity = n_distinct / n_tokens
    else:
        diversity = 0

    if tokens:
        mean_length = np.mean([len(token) for token in tokens])
    else:
        mean_length = 0

    # Sentence-ending punctuation marks, counted individually
    terminators = sum(text.count(mark) for mark in ('.', '!', '?'))

    stats = {}
    stats['word_count'] = n_tokens
    stats['unique_words'] = n_distinct
    stats['vocabulary_diversity'] = diversity
    stats['char_distribution'] = Counter(text)
    stats['sentences'] = terminators
    stats['avg_word_length'] = mean_length
    stats['lines'] = text.count('\n')
    return stats

print("‚úì Text analysis function ready")

In [None]:
"""Analyze quality of generated text"""

print("\n" + "="*70)
print("QUALITATIVE ANALYSIS")
print("="*70)

# Generate longer sample for analysis
print("\nüî¨ Generating extended sample for analysis...")
long_sample = generate_text(
    model=TRAINED_MODEL,
    seed_text="To be or not to be",
    length=500,
    temperature=0.8,
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
    seq_length=CONFIG['sequence_length']
)

print(f"\nüìù Generated Text (500 characters, temp=0.8):")
print("‚îÄ"*70)
print(long_sample)
print("‚îÄ"*70)

# Analyze generated text
gen_analysis = analyze_text(long_sample)

print(f"\nüìä Generated Text Metrics:")
print(f"  ‚îú‚îÄ Total Words: {gen_analysis['word_count']}")
print(f"  ‚îú‚îÄ Unique Words: {gen_analysis['unique_words']}")
print(f"  ‚îú‚îÄ Vocabulary Diversity: {gen_analysis['vocabulary_diversity']:.1%}")
print(f"  ‚îú‚îÄ Average Word Length: {gen_analysis['avg_word_length']:.2f} characters")
print(f"  ‚îú‚îÄ Sentence-like Structures: {gen_analysis['sentences']}")
print(f"  ‚îî‚îÄ Line Breaks: {gen_analysis['lines']}")

# Compare with original Shakespeare
print(f"\nüìö Original Shakespeare Metrics (for comparison):")
orig_sample = test_text[:len(long_sample)]
orig_analysis = analyze_text(orig_sample)

print(f"  ‚îú‚îÄ Total Words: {orig_analysis['word_count']}")
print(f"  ‚îú‚îÄ Unique Words: {orig_analysis['unique_words']}")
print(f"  ‚îú‚îÄ Vocabulary Diversity: {orig_analysis['vocabulary_diversity']:.1%}")
print(f"  ‚îú‚îÄ Average Word Length: {orig_analysis['avg_word_length']:.2f} characters")
print(f"  ‚îú‚îÄ Sentence-like Structures: {orig_analysis['sentences']}")
print(f"  ‚îî‚îÄ Line Breaks: {orig_analysis['lines']}")

# Comparison
print(f"\nüìà Comparison:")
diversity_diff = (gen_analysis['vocabulary_diversity'] - 
                 orig_analysis['vocabulary_diversity']) * 100
word_len_diff = gen_analysis['avg_word_length'] - orig_analysis['avg_word_length']

print(f"  ‚îú‚îÄ Diversity difference: {diversity_diff:+.1f}% " + 
      f"({'better' if diversity_diff > 0 else 'worse'})")
print(f"  ‚îî‚îÄ Word length difference: {word_len_diff:+.2f} chars " +
      f"({'more complex' if word_len_diff > 0 else 'simpler'})")

TEXT_ANALYSIS = {
    'generated': gen_analysis,
    'original': orig_analysis
}

In [None]:
"""Visualize text quality comparison"""

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Vocabulary metrics
categories = ['Word Count', 'Unique Words', 'Vocabulary\nDiversity (%)']
gen_vals = [
    gen_analysis['word_count'],
    gen_analysis['unique_words'],
    gen_analysis['vocabulary_diversity'] * 100
]
orig_vals = [
    orig_analysis['word_count'],
    orig_analysis['unique_words'],
    orig_analysis['vocabulary_diversity'] * 100
]

x = np.arange(len(categories))
width = 0.35

axes[0, 0].bar(x - width/2, gen_vals, width, label='Generated', color='steelblue', alpha=0.8)
axes[0, 0].bar(x + width/2, orig_vals, width, label='Original Shakespeare', color='coral', alpha=0.8)
axes[0, 0].set_ylabel('Value', fontsize=11)
axes[0, 0].set_title('Vocabulary Metrics Comparison', fontsize=12, fontweight='bold')
axes[0, 0].set_xticks(x)
axes[0, 0].set_xticklabels(categories)
axes[0, 0].legend()
axes[0, 0].grid(axis='y', alpha=0.3)

# Plot 2: Average word length
axes[0, 1].bar(['Generated', 'Original'], 
              [gen_analysis['avg_word_length'], orig_analysis['avg_word_length']],
              color=['steelblue', 'coral'], alpha=0.8)
axes[0, 1].set_ylabel('Characters', fontsize=11)
axes[0, 1].set_title('Average Word Length', fontsize=12, fontweight='bold')
axes[0, 1].grid(axis='y', alpha=0.3)

# Plot 3: Character distribution (top 10)
top_chars_gen = Counter(long_sample).most_common(10)
top_chars_orig = Counter(orig_sample).most_common(10)

chars_gen, counts_gen = zip(*top_chars_gen)
chars_orig, counts_orig = zip(*top_chars_orig)

x_pos = np.arange(len(chars_gen))
axes[1, 0].bar(x_pos, counts_gen, color='steelblue', alpha=0.8, label='Generated')
axes[1, 0].set_ylabel('Frequency', fontsize=11)
axes[1, 0].set_title('Top 10 Character Distribution (Generated)', fontsize=12, fontweight='bold')
axes[1, 0].set_xticks(x_pos)
axes[1, 0].set_xticklabels(chars_gen, fontsize=10)
axes[1, 0].grid(axis='y', alpha=0.3)

# Plot 4: Sentence-like structures
axes[1, 1].bar(['Generated', 'Original'],
              [gen_analysis['sentences'], orig_analysis['sentences']],
              color=['steelblue', 'coral'], alpha=0.8)
axes[1, 1].set_ylabel('Count', fontsize=11)
axes[1, 1].set_title('Sentence-like Structures', fontsize=12, fontweight='bold')
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('text_quality_analysis.png', dpi=100, bbox_inches='tight')
plt.show()

print("‚úì Text quality visualization saved to text_quality_analysis.png")

## Section 6: Model Architecture and Parameters

Detailed analysis of model structure, parameter count, and size.

In [None]:
"""Display model architecture and parameter analysis"""

print("="*70)
print("MODEL ARCHITECTURE ANALYSIS")
print("="*70)

print("\nüèóÔ∏è Model Summary:")
print("-"*70)
TRAINED_MODEL.summary()

# Parameter analysis
total_params = TRAINED_MODEL.count_params()
trainable_params = sum([np.prod(w.shape) for w in TRAINED_MODEL.trainable_weights])
non_trainable_params = total_params - trainable_params

print(f"\nüìä Parameter Statistics:")
print(f"  ‚îú‚îÄ Total Parameters: {total_params:,}")
print(f"  ‚îú‚îÄ Trainable Parameters: {trainable_params:,} ({100*trainable_params/total_params:.1f}%)")
print(f"  ‚îî‚îÄ Non-trainable Parameters: {non_trainable_params:,}")

# Model size estimation (assuming float32)
model_size_mb_fp32 = (total_params * 4) / (1024 * 1024)
model_size_mb_fp16 = (total_params * 2) / (1024 * 1024)
model_size_mb_int8 = (total_params * 1) / (1024 * 1024)

print(f"\nüíæ Model Size Estimation:")
print(f"  ‚îú‚îÄ Float32 (FP32): {model_size_mb_fp32:.2f} MB")
print(f"  ‚îú‚îÄ Float16 (FP16): {model_size_mb_fp16:.2f} MB (2x compression)")
print(f"  ‚îî‚îÄ Int8 (INT8):    {model_size_mb_int8:.2f} MB (4x compression)")

# Configuration summary
print(f"\n‚öôÔ∏è  Configuration Summary:")
print(f"  ‚îú‚îÄ Vocabulary Size: {vocab_size}")
print(f"  ‚îú‚îÄ Sequence Length: {CONFIG['sequence_length']}")
print(f"  ‚îú‚îÄ Embedding Dim: {CONFIG['embedding_dim']}")
print(f"  ‚îú‚îÄ Bi-LSTM Units: [{CONFIG['lstm_units']}, {CONFIG['lstm_units_2']}]")
print(f"  ‚îú‚îÄ Dense Units: {CONFIG['dense_units']}")
print(f"  ‚îî‚îÄ Dropout Rate: {CONFIG['dropout_rate']}")

# Layer-by-layer breakdown
# Keras 3 layers no longer expose an `output_shape` attribute, so fall back
# to the shape of the layer's output tensor when it is missing. This keeps
# the cell working on both older and newer Keras versions.
print(f"\nüìã Layer-by-Layer Breakdown:")
for i, layer in enumerate(TRAINED_MODEL.layers):
    layer_output_shape = getattr(layer, 'output_shape', None)
    if layer_output_shape is None:
        layer_output_shape = tuple(layer.output.shape)
    print(f"  {i+1}. {layer.name:.<30} {layer_output_shape}")

ARCHITECTURE_INFO = {
    'total_params': total_params,
    'trainable_params': trainable_params,
    'model_size_mb_fp32': model_size_mb_fp32,
    'model_size_mb_int8': model_size_mb_int8
}

In [None]:
"""Create visualization of model architecture.

Attempts to render the Keras architecture diagram (best effort), then draws a
two-panel figure: parameters per layer (top 8) and an estimated parameter
composition pie chart, saved to model_parameters.png.
"""

from tensorflow.keras.utils import plot_model

# Plot model architecture. This is deliberately best-effort: any failure is
# reported and the rest of the cell still runs.
try:
    plot_model(
        TRAINED_MODEL,
        to_file='model_architecture.png',
        show_shapes=True,
        show_layer_activations=True,
        rankdir='TB',
        expand_nested=True,
        dpi=100
    )
    print("‚úì Model architecture diagram saved to model_architecture.png")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not generate architecture diagram: {e}")

# Create parameter visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Parameter distribution by layer (layers without weights are skipped)
param_by_layer = {}
for layer in TRAINED_MODEL.layers:
    params = layer.count_params()
    if params > 0:
        param_by_layer[layer.name] = params

sorted_layers = sorted(param_by_layer.items(), key=lambda x: x[1], reverse=True)[:8]
layer_names = [name for name, _ in sorted_layers]
layer_params = [params for _, params in sorted_layers]

colors = plt.cm.Set3(np.linspace(0, 1, len(layer_names)))
ax1.barh(layer_names, layer_params, color=colors, alpha=0.8)
ax1.set_xlabel('Parameters', fontsize=11)
ax1.set_title('Parameters by Layer (Top 8)', fontsize=12, fontweight='bold')
ax1.grid(axis='x', alpha=0.3)

# Add values next to each bar
for i, params in enumerate(layer_params):
    ax1.text(params, i, f' {params:,}', va='center', fontsize=9)

# Parameter composition pie chart.
# These closed-form counts are estimates from CONFIG, not read from the model.
# NOTE(review): they assume both Dense layers use a bias and that the hidden
# Dense layer is fed by the second Bi-LSTM (2 * lstm_units_2 features) -
# confirm against the model definition.
embedding_params = vocab_size * CONFIG['embedding_dim']
dense_hidden_params = CONFIG['dense_units'] * (CONFIG['lstm_units_2'] * 2 + 1)
# Include the output bias (+1) so it is not misattributed to the LSTM slice.
output_params = vocab_size * (CONFIG['dense_units'] + 1)
# Whatever is left after the other three groups is attributed to the LSTMs.
lstm_params = total_params - embedding_params - dense_hidden_params - output_params

param_categories = {
    'Embedding': embedding_params,
    'LSTM Layers': lstm_params,
    'Dense Hidden': dense_hidden_params,
    'Output': output_params
}

# Pass real lists: dict views are not sequences and are not reliably accepted
# as array-like input by Matplotlib.
ax2.pie(list(param_categories.values()), labels=list(param_categories.keys()),
        autopct='%1.1f%%', colors=colors, startangle=90)
ax2.set_title('Parameter Composition', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('model_parameters.png', dpi=100, bbox_inches='tight')
plt.show()

print("‚úì Model parameter visualization saved to model_parameters.png")

## Section 7: Final Summary and Recommendations

Comprehensive evaluation summary with recommendations for model improvement.

In [None]:
"""Build the first part (sections 1-5) of the final evaluation report"""

_banner = "="*70
print(_banner, "FINAL EVALUATION REPORT", _banner, sep="\n")

# Rule lines and nested lookups used repeatedly in the template below.
_heavy_rule = '‚ñà'*70
_light_rule = '‚îÄ'*70
_best_run = BENCHMARK_RESULTS[optimal_batch]
_generated = TEXT_ANALYSIS['generated']
_original = TEXT_ANALYSIS['original']

# The template layout is runtime text: spacing and line breaks are significant.
report = f"""
{_heavy_rule}
CHARACTER-LEVEL BI-LSTM SHAKESPEARE TEXT GENERATION
Comprehensive Test and Evaluation Report
{_heavy_rule}

1. QUANTITATIVE METRICS
{_light_rule}

Test Accuracy:           {EVAL_RESULTS['test_accuracy']*100:>6.2f}% (target: >70%)
Top-5 Accuracy:          {EVAL_RESULTS['test_top5_accuracy']*100:>6.2f}%
Loss (Sparse CE):        {EVAL_RESULTS['test_loss']:>6.4f}
Perplexity:              {EVAL_RESULTS['perplexity']:>6.2f} (target: <10)

2. MODEL PARAMETERS
{_light_rule}

Total Parameters:        {ARCHITECTURE_INFO['total_params']:>6,}
Trainable Parameters:    {ARCHITECTURE_INFO['trainable_params']:>6,}
Model Size (FP32):       {ARCHITECTURE_INFO['model_size_mb_fp32']:>6.2f} MB
Model Size (INT8):       {ARCHITECTURE_INFO['model_size_mb_int8']:>6.2f} MB

3. INFERENCE PERFORMANCE
{_light_rule}

Optimal Batch Size:      {optimal_batch:>6}
Throughput:              {_best_run['throughput']:>6.1f} samples/sec
Per-Sample Latency:      {_best_run['per_sample_ms']:>6.3f} ms

4. TEXT QUALITY METRICS
{_light_rule}

Generated Text:
  Vocabulary Diversity:  {_generated['vocabulary_diversity']*100:>6.1f}%
  Avg Word Length:       {_generated['avg_word_length']:>6.2f} chars
  Sentence Structures:   {_generated['sentences']:>6.0f}

Original Shakespeare:
  Vocabulary Diversity:  {_original['vocabulary_diversity']*100:>6.1f}%
  Avg Word Length:       {_original['avg_word_length']:>6.2f} chars
  Sentence Structures:   {_original['sentences']:>6.0f}

5. PERFORMANCE ASSESSMENT
{_light_rule}

"""

# Performance level: accuracy bands listed from highest floor to lowest; the
# first band whose floor is met wins, otherwise the fallback pair is used.
_rating_bands = (
    (0.70, "üåü EXCELLENT", "Exceeds 70% accuracy target!"),
    (0.60, "‚úÖ GOOD", "Strong performance, approaching target"),
    (0.50, "‚ö†Ô∏è  MODERATE", "Decent baseline, further training recommended"),
)
_test_accuracy = EVAL_RESULTS['test_accuracy']
rating, desc = next(
    ((label, blurb) for floor, label, blurb in _rating_bands if _test_accuracy >= floor),
    ("‚ùå NEEDS WORK", "Early training stage, more epochs needed"),
)

# Values interpolated into the closing sections (rating, recommendations,
# model state) of the report.
_accuracy_pct = EVAL_RESULTS['test_accuracy']*100
_int8_mb = ARCHITECTURE_INFO['model_size_mb_int8']
_fp32_mb = ARCHITECTURE_INFO['model_size_mb_fp32']
_source_label = 'From Scratch' if TRAINED_FROM_SCRATCH else 'Pre-trained Model'
_generated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
_thin_rule = '‚îÄ'*70
_thick_rule = '‚ñà'*70

# The template layout is runtime text: spacing and line breaks are significant.
_report_tail = f"""
Overall Rating:          {rating}
Assessment:              {desc}

6. RECOMMENDATIONS
{_thin_rule}

To improve accuracy beyond {_accuracy_pct:.1f}%:
  ‚úì Train for more epochs ({CONFIG['epochs']} ‚Üí 50-100)
  ‚úì Use lower initial learning rate (0.001)
  ‚úì Implement curriculum learning (start with short sequences)
  ‚úì Add data augmentation techniques
  ‚úì Experiment with learning rate schedules

To improve inference speed:
  ‚úì Use batch inference (current optimal: batch={optimal_batch})
  ‚úì Quantize model to INT8 ({_int8_mb:.1f} MB vs {_fp32_mb:.1f} MB)
  ‚úì Consider TFLite conversion for mobile deployment
  ‚úì Use GPU acceleration if available

7. MODEL STATE
{_thin_rule}

Training Source:         {_source_label}
Data Used:               Shakespeare.txt ({len(text):,} characters)
Training Sequences:      {len(X_train):,}
Validation Sequences:    {len(X_val):,}
Test Sequences:          {len(X_test):,}

{_thick_rule}
Report Generated: {_generated_at}
{_thick_rule}
"""

# Append the closing sections to the report built so far and show it.
report = report + _report_tail

print(report)

# Save report to file.
# The report contains non-ASCII characters, so the encoding is pinned to UTF-8
# instead of relying on the platform default, which can raise
# UnicodeEncodeError on systems whose default codec cannot represent them.
with open('evaluation_report_detailed.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print("\n‚úì Detailed report saved to evaluation_report_detailed.txt")

## Interactive Testing (Optional)

Use the cells below to generate text with custom prompts and parameters.

In [None]:
"""Generate text from a user-chosen prompt, length and sampling temperature"""

# Knobs to edit before running this cell
CUSTOM_PROMPT = "To be or not"
CUSTOM_LENGTH = 300
CUSTOM_TEMPERATURE = 0.8

_header_rule = "="*70
print(_header_rule, "CUSTOM TEXT GENERATION", _header_rule, sep="\n")

# Echo the chosen settings before generating.
print(
    f"\n‚öôÔ∏è  Parameters:",
    f"  Prompt: '{CUSTOM_PROMPT}'",
    f"  Length: {CUSTOM_LENGTH} characters",
    f"  Temperature: {CUSTOM_TEMPERATURE}",
    sep="\n",
)

print(f"\nüé≠ Generating text...")
_generation_kwargs = {
    'model': TRAINED_MODEL,
    'seed_text': CUSTOM_PROMPT,
    'length': CUSTOM_LENGTH,
    'temperature': CUSTOM_TEMPERATURE,
    'char_to_idx': char_to_idx,
    'idx_to_char': idx_to_char,
    'seq_length': CONFIG['sequence_length'],
}
custom_text = generate_text(**_generation_kwargs)

# Show the generated text framed by divider lines.
_divider = "‚îÄ"*70
print(f"\nüìù Generated text:")
print(_divider, custom_text, _divider, sep="\n")

# Summarise the word statistics of this single generation.
custom_analysis = analyze_text(custom_text)
print(
    f"\nüìä Analysis:",
    f"  Word count: {custom_analysis['word_count']}",
    f"  Unique words: {custom_analysis['unique_words']}",
    f"  Vocabulary diversity: {custom_analysis['vocabulary_diversity']:.1%}",
    sep="\n",
)

In [None]:
"""Generate multiple text samples for qualitative review.

Generates `num_samples` continuations of one prompt with a shared temperature
and length, printing a truncated preview of each.
"""

print("="*70)
print("BATCH GENERATION")
print("="*70)

# Sampling settings, defined once so the banner below and the generate_text
# call can never disagree.
prompt = "The love of"
num_samples = 5
sample_temperature = 0.8
sample_length = 200
preview_chars = 180  # characters of each sample shown in the preview

print(f"\nüîÑ Generating {num_samples} samples with prompt: '{prompt}'")
print(f"   Temperature: {sample_temperature}, Length: {sample_length} characters")
print(f"{'‚îÄ'*70}\n")

for i in range(num_samples):
    sample = generate_text(
        model=TRAINED_MODEL,
        seed_text=prompt,
        length=sample_length,
        temperature=sample_temperature,
        char_to_idx=char_to_idx,
        idx_to_char=idx_to_char,
        seq_length=CONFIG['sequence_length']
    )

    # Only mark the preview with an ellipsis when text was actually cut off.
    if len(sample) > preview_chars:
        preview = sample[:preview_chars] + "..."
    else:
        preview = sample

    print(f"Sample {i+1}:")
    print(preview)
    print()

In [None]:
"""Save all results and artifacts.

Collects the evaluation, benchmark, architecture, text-analysis and config
dictionaries into one structure, writes it to test_results.json, saves the
character mappings and lists the files produced by the notebook.
"""

import json


def _json_default(value):
    """Convert NumPy scalars/arrays to built-in types for JSON encoding.

    Raises:
        TypeError: for any other unsupported type, matching json's own
            behaviour so unexpected objects are not silently stringified.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


print("="*70)
print("SAVING RESULTS")
print("="*70)

# Create results dictionary.
# 'char_distribution' is left out of the text-analysis entries. Benchmark keys
# are unwrapped from NumPy scalars because json's `default` hook is only
# applied to values, never to dictionary keys.
all_results = {
    'evaluation': EVAL_RESULTS,
    'benchmark': {
        (k.item() if isinstance(k, np.generic) else k): dict(v)
        for k, v in BENCHMARK_RESULTS.items()
    },
    'architecture': ARCHITECTURE_INFO,
    'text_analysis': {
        'generated': {k: v for k, v in TEXT_ANALYSIS['generated'].items()
                      if k != 'char_distribution'},
        'original': {k: v for k, v in TEXT_ANALYSIS['original'].items()
                     if k != 'char_distribution'}
    },
    'config': CONFIG
}

# Save as JSON. Serialize fully before opening the file so an encoding error
# cannot leave a truncated test_results.json behind.
results_json = json.dumps(all_results, indent=2, default=_json_default)
with open('test_results.json', 'w', encoding='utf-8') as f:
    f.write(results_json)

print("‚úì Results saved to test_results.json")

# Save character mappings
save_char_mappings(char_to_idx, idx_to_char)

print("\nüìÅ Generated Files:")
print("  ‚îú‚îÄ benchmark_results.png")
print("  ‚îú‚îÄ text_quality_analysis.png")
print("  ‚îú‚îÄ model_parameters.png")
print("  ‚îú‚îÄ evaluation_report_detailed.txt")
print("  ‚îú‚îÄ test_results.json")
print("  ‚îî‚îÄ models/char_mappings.pkl")

print("\n‚úÖ All results saved successfully!")