# Algebraic Language Model Exploration

This notebook explores the algebraic API for composing language models with n-gram projections.

In [None]:
# Setup and imports
import sys
sys.path.append('.')

from ngram_projections.models.base import LanguageModel
from ngram_projections.models.ngram import NGramModel
from ngram_projections.models.mixture import MixtureModel
from ngram_projections.projections.recency import RecencyProjection
from ngram_projections.projections.edit_distance import EditDistanceProjection
from ngram_projections.projections.semantic import SemanticProjection
from ngram_projections.projections.attention import AttentionProjection
from ngram_projections.algebra.combinators import compose, ensemble, cascade, memoize

import numpy as np
import matplotlib.pyplot as plt
from typing import List, Dict

# For pretty printing
import json
def pprint(obj):
    """Print ``obj``, rendering dictionaries as indented JSON when possible.

    Args:
        obj: Any object. Dictionaries are serialized with ``json.dumps`` using
            a two-space indent; everything else is passed straight to ``print``.

    Dictionaries that JSON cannot encode (for example tuple keys, or values of
    arbitrary classes) are printed with their regular representation instead
    of raising, so the helper is safe to call on any dictionary.
    """
    if isinstance(obj, dict):
        try:
            rendered = json.dumps(obj, indent=2)
        except (TypeError, ValueError):
            # Not JSON-encodable (unsupported key/value types, circular
            # references): fall back to the plain repr rather than crashing.
            rendered = obj
        print(rendered)
    else:
        print(obj)

## 1. Basic Model Creation and Training

In [None]:
# Toy corpus: every sentence is fed to the models as one training sequence
training_texts = [
    "The quick brown fox jumps over the lazy dog",
    "The dog chased the cat through the garden",
    "Natural language processing with neural networks",
    "Deep learning models for text generation",
    "The cat sat on the mat in the sunny garden",
    "Machine learning algorithms process natural language",
    "The brown dog ran quickly through the park",
    "Neural networks learn patterns from data",
]

# One n-gram model per order (2, 3 and 4)
ngram_2, ngram_3, ngram_4 = (NGramModel(n=order) for order in (2, 3, 4))

# Lower-case and whitespace-tokenize each sentence once, then train every model on it
for sentence in training_texts:
    sentence_tokens = sentence.lower().split()
    for ngram_model in (ngram_2, ngram_3, ngram_4):
        ngram_model.train(sentence_tokens)

print("Trained n-gram models with orders 2, 3, and 4")
for label, trained_model in (("Bigram", ngram_2), ("Trigram", ngram_3), ("4-gram", ngram_4)):
    print(f"{label} model has {len(trained_model.counts)} unique n-grams")

## 2. Simple Algebraic Operations

In [None]:
# Context shared by every prediction in this cell
context = ["the", "dog"]

print("Individual model predictions:")
for label, single_model in (("Bigram", ngram_2), ("Trigram", ngram_3)):
    print(f"{label}: {single_model.predict(context)}")
# The trailing newline separates the single-model output from the mixtures
print(f"4-gram: {ngram_4.predict(context)}\n")

print("Mixture models using algebraic operations:")

# `+` without scalar weights (presumably an equal-weight mixture; confirm in MixtureModel)
mixture_equal = ngram_2 + ngram_3
equal_prediction = mixture_equal.predict(context)
print(f"ngram_2 + ngram_3: {equal_prediction}")

# Explicit scalar weights on two models
mixture_weighted = 0.3 * ngram_2 + 0.7 * ngram_3
weighted_prediction = mixture_weighted.predict(context)
print(f"0.3 * ngram_2 + 0.7 * ngram_3: {weighted_prediction}")

# Explicit scalar weights on three models
mixture_three = 0.2 * ngram_2 + 0.5 * ngram_3 + 0.3 * ngram_4
three_way_prediction = mixture_three.predict(context)
print(f"0.2 * ngram_2 + 0.5 * ngram_3 + 0.3 * ngram_4: {three_way_prediction}")

## 3. Projection Functions

In [None]:
# Instantiate one projection of each kind
recency = RecencyProjection(max_suffix_len=3)
edit_dist = EditDistanceProjection(max_distance=2)
semantic = SemanticProjection(embedding_dim=50)
attention = AttentionProjection()

# A context longer than the n-gram orders used above
long_context = ["the", "quick", "brown", "fox", "jumps", "over", "the"]

print("Projection results:")
print(f"Original context: {long_context}")
for label, projection in (
    ("Recency", recency),
    ("Edit distance", edit_dist),
    ("Semantic", semantic),
    ("Attention", attention),
):
    print(f"{label} projection: {projection.project(long_context)}")

## 4. Models with Projections

In [None]:
# `model @ projection` attaches a projection to the trigram model
ngram_recency = ngram_3 @ recency
ngram_semantic = ngram_3 @ semantic
ngram_attention = ngram_3 @ attention

# One context, predicted with and without each projection
test_context = ["natural", "language", "processing", "with", "neural"]

print("Predictions with different projections:")
print(f"Context: {test_context}\n")
for label, candidate in (
    ("No projection", ngram_3),
    ("With recency", ngram_recency),
    ("With semantic", ngram_semantic),
    ("With attention", ngram_attention),
):
    print(f"{label}: {candidate.predict(test_context)}")

## 5. Composed Projections

In [None]:
# `>>` chains two projections
recency_then_semantic = recency >> semantic
semantic_then_attention = semantic >> attention

# `|` combines two projections
recency_or_semantic = recency | semantic

# Show what each combined projection produces for the same context
print("Composed projections:")
print(f"Original context: {test_context}\n")
for label, combined in (
    ("Recency >> Semantic", recency_then_semantic),
    ("Semantic >> Attention", semantic_then_attention),
    ("Recency | Semantic", recency_or_semantic),
):
    print(f"{label}: {combined.project(test_context)}")

# A composed projection can be attached to a model like any single projection
ngram_complex = ngram_3 @ (recency >> semantic)
complex_prediction = ngram_complex.predict(test_context)
print(f"\nModel with recency>>semantic: {complex_prediction}")

## 6. Mock LLM Integration

In [None]:
# Create mock LLMs for demonstration
from ngram_projections.models.base import LanguageModel

class MockLLM(LanguageModel):
    """Stand-in for an LLM that returns a fixed, optionally biased distribution.

    The prediction ignores the context: it starts from a hard-coded table of
    five tokens, adds 0.2 to every token listed in ``bias_tokens`` (creating
    the entry if it is missing), and normalizes the result to sum to 1.
    """

    def __init__(self, name, bias_tokens=None):
        self.name = name
        # Any falsy argument (None, empty list) becomes a fresh empty list
        self.bias_tokens = bias_tokens if bias_tokens else []

    def predict(self, context: List[str]) -> Dict[str, float]:
        """Return the biased, normalized token distribution (``context`` is unused)."""
        scores = {
            "networks": 0.15,
            "models": 0.12,
            "algorithms": 0.10,
            "data": 0.08,
            "learning": 0.07,
        }

        # Boost every biased token by a fixed amount
        for biased in self.bias_tokens:
            scores[biased] = scores.get(biased, 0) + 0.2

        # Rescale so the values sum to 1
        normalizer = sum(scores.values())
        return {token: score / normalizer for token, score in scores.items()}

    def __repr__(self):
        return f"MockLLM({self.name})"

# Three mock LLMs: one unbiased, two nudged toward different vocabularies
llm_general = MockLLM(name="general")
llm_technical = MockLLM(name="technical", bias_tokens=["algorithms", "networks"])
llm_ml = MockLLM(name="ml-focused", bias_tokens=["learning", "models", "data"])

## 7. N-gram + LLM Mixtures

In [None]:
# Create mixtures of n-gram and LLM models
context = ["neural", "network"]

# Pure models
print("Individual predictions:")
print(f"N-gram: {ngram_3.predict(context)}")
print(f"General LLM: {llm_general.predict(context)}")
print(f"Technical LLM: {llm_technical.predict(context)}")
print(f"ML LLM: {llm_ml.predict(context)}\n")

# Mixtures
print("Mixture models:")

# N-gram for grounding + LLM for generalization
grounded = 0.4 * ngram_3 + 0.6 * llm_general
print(f"0.4 * ngram + 0.6 * general_llm: {grounded.predict(context)}\n")

# Multiple LLMs with n-gram.
# Deliberately NOT named `ensemble`: that name is the combinator imported from
# ngram_projections.algebra.combinators in the setup cell, and rebinding it to
# a model would break any later call to `ensemble(...)`.
ngram_llm_mixture = 0.3 * ngram_3 + 0.3 * llm_technical + 0.4 * llm_ml
print(f"0.3 * ngram + 0.3 * technical + 0.4 * ml: {ngram_llm_mixture.predict(context)}")

## 8. Complex Compositions

In [None]:
# Build sophisticated model compositions

# 1. N-gram with projection + LLM ensemble, assembled from named parts
projected_ngram = ngram_3 @ recency
llm_pair = llm_technical | llm_ml
sophisticated = 0.3 * projected_ngram + 0.7 * llm_pair

# 2. Cascading model with fallback
from ngram_projections.algebra.combinators import cascade

# Create a confidence-based cascade
def confidence_cascade(models, context, threshold=0.3):
    """Return the first prediction whose top probability exceeds ``threshold``.

    Models are queried in the given order, and each model is queried at most
    once. If no model is confident enough, the last model's prediction is
    returned as the fallback.

    Args:
        models: Non-empty sequence of objects exposing ``predict(context)``
            that returns a token -> probability mapping.
        context: Context passed unchanged to every ``predict`` call.
        threshold: A prediction is accepted when it is non-empty and its
            largest probability is strictly greater than this value.

    Returns:
        The accepted prediction, or the last model's prediction when none
        qualifies.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("confidence_cascade requires at least one model")

    pred = None
    for model in models:
        pred = model.predict(context)
        if pred and max(pred.values()) > threshold:
            return pred
    # Nobody was confident: `pred` already holds the last model's output, so
    # return it instead of running that (typically slowest) model a second time.
    return pred

# Cascade stages, tried in this order
fast_model = ngram_2 @ recency
medium_model = ngram_3 @ semantic
slow_model = 0.5 * llm_technical + 0.5 * llm_ml
cascade_stages = [fast_model, medium_model, slow_model]

# Contexts to run through the cascade
contexts = [
    ["the", "dog"],
    ["neural", "network"],
    ["unknown", "context"],
]

print("Cascade model testing:")
for ctx in contexts:
    cascade_result = confidence_cascade(cascade_stages, ctx)
    print(f"Context {ctx}: {cascade_result}")

## 9. Functional Combinators

In [None]:
from ngram_projections.algebra.combinators import compose, ensemble, memoize

# Compose functions for preprocessing
def lowercase(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

def limit_length(max_len):
    """Build a function that keeps only the last ``max_len`` tokens.

    Args:
        max_len: Maximum number of trailing tokens to keep (0 keeps none).

    Returns:
        A function mapping a sliceable token sequence to its last ``max_len``
        items; shorter sequences are returned in full (as a slice copy).

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    def limiter(tokens):
        # `tokens[-0:]` is the *whole* sequence, so zero needs its own branch.
        return tokens[-max_len:] if max_len else tokens[:0]
    return limiter

def add_context_token(token):
    """Build a function that returns a new list with ``token`` placed first."""
    def adder(tokens):
        head = [token]
        return head + tokens
    return adder

# Assemble the preprocessing pipeline from the three steps defined above
pipeline_steps = (lowercase, limit_length(5), add_context_token("<CTX>"))
preprocess = compose(*pipeline_steps)

test_input = ["The", "Quick", "BROWN", "Fox", "Jumps", "Over", "The", "Lazy", "Dog"]
print(f"Original: {test_input}")
preprocessed = preprocess(test_input)
print(f"Preprocessed: {preprocessed}\n")

# Ensemble with custom aggregation
def weighted_avg(predictions, weights):
    """Combine token distributions into one weighted sum per token.

    Args:
        predictions: Iterable of token -> probability mappings.
        weights: Iterable of scalar weights, one per prediction.

    Returns:
        Dict mapping every token seen in any prediction to the sum of
        ``weight * probability`` over the predictions containing it. The
        result is not re-normalized.

    Raises:
        ValueError: If the number of predictions and weights differ (a plain
            ``zip`` would silently drop the unmatched tail).
    """
    predictions = list(predictions)
    weights = list(weights)
    if len(predictions) != len(weights):
        raise ValueError(
            f"got {len(predictions)} predictions but {len(weights)} weights"
        )

    result = {}
    for pred, w in zip(predictions, weights):
        for token, prob in pred.items():
            result[token] = result.get(token, 0) + w * prob
    return result

# Create ensemble
models = [ngram_2, ngram_3, llm_technical]
weights = [0.2, 0.5, 0.3]

# Bind a snapshot of the weights as a default argument. A bare closure would
# look up the global `weights` each time the aggregator runs, so rebinding or
# mutating that name in a later cell would silently change this ensemble.
ensemble_model = ensemble(
    models,
    lambda preds, _weights=tuple(weights): weighted_avg(preds, _weights),
)

# Memoize expensive computations
expensive_model = memoize(0.3 * (ngram_3 @ semantic) + 0.7 * llm_ml)

print("Testing memoized model:")
context = ["machine", "learning"]
print(f"First call: {expensive_model.predict(context)}")
print(f"Second call (cached): {expensive_model.predict(context)}")

## 10. Visualization of Model Behavior

In [None]:
# Visualize how different models and mixtures behave

def plot_predictions(models, context, title="Model Predictions", top_k=10):
    """Plot each model's highest-probability predictions side by side.

    Args:
        models: Non-empty sequence of ``(name, model)`` pairs; every model
            must expose ``predict(context)`` returning a token -> probability
            mapping.
        context: List of string tokens passed to every ``predict`` call and
            shown (space-joined) in the figure title.
        title: First line of the figure title.
        top_k: Maximum number of tokens drawn per model, chosen by descending
            probability.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("plot_predictions requires at least one (name, model) pair")

    # squeeze=False always yields a 2-D array of axes, so a single model
    # needs no special handling.
    fig, axes = plt.subplots(1, len(models), figsize=(15, 4), squeeze=False)

    for ax, (name, model) in zip(axes[0], models):
        # Title every panel, including ones that end up with nothing to draw.
        ax.set_title(name)

        pred = model.predict(context)
        if not pred:
            continue

        # Rank by probability: dict order is insertion order, so slicing the
        # keys directly would not give the most likely tokens.
        ranked = sorted(pred.items(), key=lambda item: item[1], reverse=True)[:top_k]
        if not ranked:
            continue
        tokens = [token for token, _ in ranked]
        probs = [prob for _, prob in ranked]

        positions = range(len(tokens))
        ax.bar(positions, probs)
        ax.set_xticks(positions)
        ax.set_xticklabels(tokens, rotation=45, ha='right')
        ax.set_ylabel('Probability')
        # `or 1` keeps a usable y-range when every plotted probability is zero.
        ax.set_ylim([0, max(probs) * 1.2 or 1])

    plt.suptitle(f"{title}\nContext: {' '.join(context)}")
    plt.tight_layout()
    plt.show()

# Context used for the side-by-side comparison
context = ["neural", "network"]

# Panel label -> model; converted to the (name, model) pairs the plot helper expects
labelled_models = {
    "N-gram": ngram_3,
    "N-gram + Recency": ngram_3 @ recency,
    "LLM": llm_technical,
    "Mixture": 0.3 * ngram_3 + 0.7 * llm_technical,
}
models_to_compare = list(labelled_models.items())

plot_predictions(models_to_compare, context)

## 11. Performance Analysis

In [None]:
import time

def benchmark_model(model, contexts, runs=100):
    """Time repeated prediction passes over ``contexts``.

    Args:
        model: Object exposing ``predict(context)``.
        contexts: Collection of contexts; it is traversed once per run, so it
            must be re-iterable (a list, not a generator).
        runs: Number of timed passes; must be at least 1.

    Returns:
        ``(mean, std)`` of the per-pass duration in seconds, as computed by
        ``np.mean`` and ``np.std``.

    Raises:
        ValueError: If ``runs`` is less than 1 (there would be nothing to
            average).
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    times = []
    for _ in range(runs):
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with wall-clock adjustments and is too coarse for short intervals.
        start = time.perf_counter()
        for ctx in contexts:
            model.predict(ctx)
        times.append(time.perf_counter() - start)
    return np.mean(times), np.std(times)

# Contexts predicted on every timed pass
test_contexts = [
    ["the", "dog"],
    ["neural", "network", "models"],
    ["machine", "learning", "algorithms", "process"],
]

# Display name -> model under test
models_to_benchmark = {
    "N-gram (2)": ngram_2,
    "N-gram (3)": ngram_3,
    "N-gram + Projection": ngram_3 @ recency,
    "Simple Mixture": ngram_2 + ngram_3,
    "Complex Mixture": 0.3 * (ngram_3 @ recency) + 0.7 * llm_technical,
}

print("Performance Benchmarks (100 runs):")
print("-" * 50)

# name -> (mean, std), both converted from seconds to milliseconds
results = {}
for name, model in models_to_benchmark.items():
    mean_time, std_time = benchmark_model(model, test_contexts, runs=100)
    mean_ms, std_ms = mean_time * 1000, std_time * 1000
    results[name] = (mean_ms, std_ms)
    print(f"{name:25} {mean_ms:.2f} ± {std_ms:.2f} ms")

# Bar chart of mean time per model, with the standard deviation as error bars
fig, ax = plt.subplots(figsize=(10, 5))
names = list(results)
means = [mean_ms for mean_ms, _ in results.values()]
stds = [std_ms for _, std_ms in results.values()]
bar_positions = range(len(names))

ax.bar(bar_positions, means, yerr=stds, capsize=5)
ax.set_xticks(bar_positions)
ax.set_xticklabels(names, rotation=45, ha='right')
ax.set_ylabel('Time (ms)')
ax.set_title('Model Performance Comparison')
plt.tight_layout()
plt.show()

## 12. Real-World Example: Adaptive Model

In [None]:
class AdaptiveModel:
    """Two-model mixture whose weighting depends on the context length.

    Every ``predict`` call builds a fresh mixture
    ``alpha * base_model + (1 - alpha) * adaptation_model`` and records the
    weight it used in ``history`` for later inspection.
    """

    def __init__(self, base_model, adaptation_model, alpha=0.5):
        """Store the two models and the nominal base-model weight.

        Args:
            base_model: Model weighted by the adapted alpha.
            adaptation_model: Model weighted by ``1 - alpha``.
            alpha: Nominal base-model weight before per-call adaptation.
        """
        self.base_model = base_model
        self.adaptation_model = adaptation_model
        self.alpha = alpha
        # One dict per predict() call: context_len, alpha, context
        self.history = []

    def predict(self, context):
        """Predict with a mixture weighted according to ``len(context)``.

        Contexts shorter than 3 tokens use ``min(0.7, alpha * 1.5)`` as the
        base-model weight; longer ones use ``max(0.3, alpha * 0.7)``.

        Args:
            context: Sized iterable of tokens, forwarded to the mixture.

        Returns:
            Whatever the mixture's ``predict`` returns for ``context``.
        """
        # Adapt alpha based on context length
        if len(context) < 3:
            # Short context: rely more on base model
            current_alpha = min(0.7, self.alpha * 1.5)
        else:
            # Long context: rely more on adaptation model
            current_alpha = max(0.3, self.alpha * 0.7)

        # Create dynamic mixture
        mixture = current_alpha * self.base_model + (1 - current_alpha) * self.adaptation_model

        # Store for analysis
        self.history.append({
            'context_len': len(context),
            'alpha': current_alpha,
            # Snapshot the tokens: keeping the caller's list itself would let
            # later in-place edits rewrite what the history claims was seen.
            'context': list(context),
        })

        return mixture.predict(context)

    def analyze_adaptation(self):
        """Scatter-plot the recorded alpha against context length.

        Prints a notice and returns without plotting when no prediction has
        been recorded yet.
        """
        if not self.history:
            print("No history available")
            return

        lengths = [h['context_len'] for h in self.history]
        alphas = [h['alpha'] for h in self.history]

        plt.figure(figsize=(10, 4))
        plt.scatter(lengths, alphas, alpha=0.6)
        plt.xlabel('Context Length')
        plt.ylabel('Alpha (base model weight)')
        plt.title('Adaptive Model Behavior')
        plt.grid(True, alpha=0.3)
        plt.show()

# Adaptive mixture of the bigram model and the semantically projected trigram model
adaptive = AdaptiveModel(base_model=ngram_2, adaptation_model=ngram_3 @ semantic, alpha=0.5)

# Contexts of increasing length
test_cases = [
    ["the"],
    ["the", "dog"],
    ["the", "quick", "brown"],
    ["neural", "network", "models", "learn"],
    ["machine", "learning", "algorithms", "process", "natural", "language"],
]

print("Adaptive Model Predictions:")
print("-" * 50)
for ctx in test_cases:
    pred = adaptive.predict(ctx)
    if not pred:
        # Nothing predicted for this context; no line is printed
        continue
    best_token, best_prob = max(pred.items(), key=lambda item: item[1])
    print(f"Context: {' '.join(ctx):30} -> {best_token} ({best_prob:.3f})")

# Plot the alpha recorded for each call above
adaptive.analyze_adaptation()

## 13. Continuous Learning Simulation

In [None]:
class ContinuousLearningModel:
    """Blend of a fixed LLM and an n-gram model that keeps training on new text."""

    def __init__(self, static_llm, n=3, ngram_weight=0.3):
        self.static_llm = static_llm
        self.ngram_weight = ngram_weight
        self.ngram = NGramModel(n=n)
        # (updates, score) pairs, one per evaluate() call
        self.performance_history = []
        self.updates = 0

    def observe(self, text):
        """Train the n-gram model on the lower-cased, whitespace-split text."""
        self.ngram.train(text.lower().split())
        self.updates += 1

    def predict(self, context):
        """Predict with a freshly built n-gram/LLM mixture."""
        llm_weight = 1 - self.ngram_weight
        blended = self.ngram_weight * self.ngram + llm_weight * self.static_llm
        return blended.predict(context)

    def evaluate(self, test_data):
        """Score the model on ``test_data`` and log the result.

        For every token position from index 2 onward, the model predicts from
        the preceding tokens and the probability given to the actual token is
        accumulated. The returned value is that total divided by the number
        of positions (0 when there are none), i.e. a mean assigned
        probability rather than a hit rate.
        """
        prob_mass = 0
        n_positions = 0

        for sample in test_data:
            words = sample.lower().split()
            for split_at in range(2, len(words)):
                n_positions += 1
                expected = words[split_at]
                distribution = self.predict(words[:split_at])
                if not distribution or expected not in distribution:
                    continue
                prob_mass += distribution[expected]

        score = prob_mass / n_positions if n_positions > 0 else 0
        self.performance_history.append((self.updates, score))
        return score

# Run the continuous-learning simulation
print("Simulating Continuous Learning:")
print("=" * 50)

# Model under test: general mock LLM blended with a trigram model
cl_model = ContinuousLearningModel(static_llm=llm_general, n=3, ngram_weight=0.3)

# Sentences observed one at a time (simulated stream)
stream_data = [
    "The new research on quantum computing shows promise",
    "Quantum algorithms can solve certain problems exponentially faster",
    "Machine learning models require large amounts of data",
    "Deep neural networks learn hierarchical representations",
    "Quantum machine learning combines both paradigms",
]

# Held-out phrases scored after every update
test_data = [
    "quantum computing research",
    "machine learning algorithms",
    "neural networks learn",
]

# Score before any sentence has been observed
initial_acc = cl_model.evaluate(test_data)
print(f"Initial accuracy: {initial_acc:.3f}")

# Observe one sentence, then re-score
for update_number, data in enumerate(stream_data, start=1):
    cl_model.observe(data)
    acc = cl_model.evaluate(test_data)
    print(f"After update {update_number}: accuracy = {acc:.3f}")

# Learning curve: score versus number of observed sentences
if cl_model.performance_history:
    updates = tuple(n_updates for n_updates, _ in cl_model.performance_history)
    accuracies = tuple(score for _, score in cl_model.performance_history)

    plt.figure(figsize=(10, 5))
    plt.plot(updates, accuracies, 'o-', linewidth=2, markersize=8)
    plt.xlabel('Number of Updates')
    plt.ylabel('Accuracy')
    plt.title('Continuous Learning Performance')
    plt.grid(True, alpha=0.3)
    plt.show()

ngram_size = len(cl_model.ngram.counts)
print(f"\nFinal n-gram size: {ngram_size} unique n-grams")

## 14. Advanced Algebra: Custom Operators

In [None]:
# Define a custom LanguageModel subclass that switches between two models based on the context

class ContextAwareModel(LanguageModel):
    """Delegates each prediction to a formal or a casual model.

    The choice is made per call by counting how many context tokens appear in
    two fixed keyword sets; the formal model is used only when its count is
    strictly larger, so ties go to the casual model.
    """

    def __init__(self, formal_model, casual_model):
        self.formal_model = formal_model
        self.casual_model = casual_model

    def predict(self, context):
        """Predict with whichever model matches the context's keyword counts."""
        formal_indicators = {'research', 'algorithm', 'neural', 'quantum', 'model'}
        casual_indicators = {'dog', 'cat', 'quick', 'lazy', 'brown'}

        # Count keyword hits for each register
        formal_score = len([token for token in context if token in formal_indicators])
        casual_score = len([token for token in context if token in casual_indicators])

        chosen = self.formal_model if formal_score > casual_score else self.casual_model
        return chosen.predict(context)

    def __repr__(self):
        return f"ContextAware({self.formal_model} <-> {self.casual_model})"

# Two mixtures with different n-gram/LLM balances, wrapped in the switching model
formal = 0.2 * ngram_3 + 0.8 * llm_technical
casual = 0.6 * ngram_2 + 0.4 * llm_general
context_aware = ContextAwareModel(formal, casual)

# Contexts expected to hit each keyword set
test_contexts = [
    ["neural", "network", "algorithm"],  # Formal
    ["the", "dog", "ran"],  # Casual
    ["quantum", "computing", "research"],  # Formal
    ["quick", "brown", "fox"],  # Casual
]

print("Context-Aware Model Predictions:")
print("=" * 50)

for ctx in test_contexts:
    pred = context_aware.predict(ctx)
    if not pred:
        continue
    # Three most probable tokens, highest first
    ranked = sorted(pred.items(), key=lambda item: item[1], reverse=True)
    top_tokens = ranked[:3]
    summary = ', '.join([f'{t}({p:.2f})' for t, p in top_tokens])
    print(f"Context: {' '.join(ctx):25}")
    print(f"  Top predictions: {summary}")
    print()

## 15. Summary and Best Practices

In [None]:
# Walk through the composition patterns covered above, one per step

print("Best Practices for Algebraic Model Composition")
print("=" * 50, end="\n\n")

# Step 1: plain mixture of two models
print("1. Start Simple:")
simple = ngram_3 + llm_general
print("   Basic mixture: ngram + llm", end="\n\n")

# Step 2: attach a projection to the n-gram side
print("2. Add Projections:")
focused = (ngram_3 @ recency) + llm_general
print("   With projection: (ngram @ recency) + llm", end="\n\n")

# Step 3: explicit scalar weights
print("3. Fine-tune with Weights:")
weighted = 0.3 * (ngram_3 @ recency) + 0.7 * llm_general
print("   Weighted: 0.3 * (ngram @ recency) + 0.7 * llm", end="\n\n")

# Step 4: chained projections
print("4. Compose Projections:")
sophisticated = ngram_3 @ (recency >> semantic)
print("   Composed: ngram @ (recency >> semantic)", end="\n\n")

# Step 5: `|` between models, combined with `+`
print("5. Build Ensembles:")
robust = (ngram_2 | ngram_3) + (llm_general | llm_technical)
print("   Ensemble: (ngram_2 | ngram_3) + (llm_general | llm_technical)", end="\n\n")

# Step 6: wrap a mixture in memoize
print("6. Optimize with Memoization:")
optimized = memoize(0.3 * (ngram_3 @ semantic) + 0.7 * llm_ml)
print("   Cached: memoize(expensive_mixture)", end="\n\n")

print("=" * 50)
print("\nKey Insights:")
for insight in (
    "• Algebraic operations make complex models readable",
    "• Projections focus context for better predictions",
    "• Mixtures combine strengths of different models",
    "• Continuous learning via n-gram updates is efficient",
    "• Composition enables sophisticated behavior from simple parts",
):
    print(insight)