# Architecture-Based Tenant Replication with Spectral Graph Analysis

This notebook demonstrates the architecture-based tenant replication feature with visualizations showing:

1. **Pattern Detection**: Identifies architectural patterns in the source tenant
2. **Replication Plan**: Generates a plan to replicate tenant structure using default settings
3. **Graph Comparison**: Compares source and target pattern graphs
4. **Visualizations**:
   - Node overlap analysis (bar chart)
   - Side-by-side graph visualization
   - Spectral distance evolution during plan generation
   - Coverage analysis and recommendations

---

## Setup

In [1]:
import os
import sys
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from matplotlib.patches import Patch

sys.path.insert(0, str(Path.cwd().parent))

# Force reload to pick up code changes
import importlib
if "src.architecture_based_replicator" in sys.modules:
    importlib.reload(sys.modules["src.architecture_based_replicator"])

from src.architecture_based_replicator import ArchitecturePatternReplicator

# Set up matplotlib
plt.style.use("seaborn-v0_8-darkgrid")
%matplotlib inline

print("✅ Setup complete")

✅ Setup complete

## Configuration

In [None]:
# Neo4j connection settings. Each value is read from the environment and falls
# back to a local default when the variable is absent.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "neo4j123")

# The password fallback is a credential committed with the notebook. Surface
# it whenever it is in effect so it is never relied on by accident; the
# password value itself is deliberately not printed.
if "NEO4J_PASSWORD" not in os.environ:
    print("⚠️  NEO4J_PASSWORD is not set; falling back to the built-in development default")

print(f"Neo4j URI: {NEO4J_URI}")

Neo4j URI: bolt://localhost:7687
Target instance count: 500


---
# Part 1: Pattern Detection & Analysis
---

## Analyze Source Tenant

In [None]:
# Construct the replicator with the Neo4j connection settings from the
# configuration cell.
replicator = ArchitecturePatternReplicator(
    neo4j_uri=NEO4J_URI, neo4j_user=NEO4J_USER, neo4j_password=NEO4J_PASSWORD
)

print("🔍 Analyzing source tenant with configuration coherence...")
analysis = replicator.analyze_source_tenant(use_configuration_coherence=True, coherence_threshold=0.5)

# Headline numbers from the analysis result. The first three keys are
# required; the instance total is optional and defaults to 0.
print("\n📊 Source Tenant:")
for label, key in (
    ("Resource Types", "resource_types"),
    ("Pattern Graph Edges", "pattern_graph_edges"),
    ("Detected Patterns", "detected_patterns"),
):
    print(f"   {label}: {analysis[key]}")
print(f"   Total Pattern Instances: {analysis.get('total_pattern_resources', 0)}")

# Per-pattern breakdown: number of instances and the summed size of those
# instances. `instances` and `total_resources` are rebound on every iteration,
# so after the loop they describe only the last pattern listed.
print("\n📐 Detected Architectural Patterns:")
for pattern_name, pattern_info in replicator.detected_patterns.items():
    instances = replicator.pattern_resources.get(pattern_name, [])
    total_resources = sum(map(len, instances))
    print(
        f"  {pattern_name}:",
        f"    Instances: {len(instances)}, Resources: {total_resources}",
        sep="\n",
    )

🔍 Analyzing source tenant with configuration coherence...

📊 Source Tenant:
   Resource Types: 91
   Pattern Graph Edges: 292
   Detected Patterns: 10
   Total Pattern Instances: 620

📐 Detected Architectural Patterns:
  Web Application:
    Instances: 58, Resources: 223
  Virtual Machine Workload:
    Instances: 64, Resources: 525
  Container Platform:
    Instances: 69, Resources: 69
  Data Platform:
    Instances: 56, Resources: 203
  Serverless Application:
    Instances: 86, Resources: 547
  Data Analytics:
    Instances: 72, Resources: 382
  Secure Workload:
    Instances: 51, Resources: 365
  Managed Identity Pattern:
    Instances: 83, Resources: 531
  Monitoring & Observability:
    Instances: 33, Resources: 286
  Network Security:
    Instances: 48, Resources: 70


---
# Generate Replication Plan
---

Generate a replication plan using default parameters (optimal for most use cases)

In [None]:
print("🔨 Generating replication plan with default settings...\n")

# Informational echo only: none of these values are passed to the replicator
# below, which is called with just `target_instance_count`.
# NOTE(review): confirm these still match generate_replication_plan's defaults.
print("Parameters:")
print("  - spectral_weight=0.4 (60% distribution, 40% structure)")
print("  - use_architecture_distribution=True")
print("  - use_configuration_coherence=True")
print("  - use_spectral_guidance=True")
print("  - include_colocated_orphaned_resources=True")
print("  - max_config_samples=100")
print("  - sampling_strategy='coverage'\n")

# Target instance count for replication
TARGET_INSTANCE_COUNT = 500

# Generate replication plan with defaults
selected_pattern_instances, spectral_history, distribution_metadata = replicator.generate_replication_plan(
    target_instance_count=TARGET_INSTANCE_COUNT
)

# The graph-building cell unpacks each entry of `selected_pattern_instances`
# as (pattern_name, instances) and iterates `instances` one instance at a
# time. Count with that same shape so "pattern instances" is the number of
# instances (not the number of entries) and "Total resources" is the summed
# size of the instances (not the number of instances).
# NOTE(review): assumes that nested shape is the real return shape — confirm
# against generate_replication_plan.
selected_instance_count = sum(len(instances) for _, instances in selected_pattern_instances)
selected_resource_count = sum(
    len(instance)
    for _, instances in selected_pattern_instances
    for instance in instances
)

print(f"\n✅ Selected {selected_instance_count} pattern instances")
print(f"   Total resources: {selected_resource_count}")

---
# Analysis & Visualizations
---

In [None]:
print("🔍 Building target pattern graph from selected instances...\n")

# Expand every (pattern_name, instances) entry into one (pattern_name, instance)
# pair per instance; that flat list is what the graph builder is given.
flattened_instances = [
    (name, member)
    for name, members in selected_pattern_instances
    for member in members
]

target_pattern_graph = replicator._build_target_pattern_graph_from_instances(flattened_instances)

# Report the size of the resulting graph.
print("✅ Target pattern graph built:")
node_total = target_pattern_graph.number_of_nodes()
print(f"   Nodes (resource types): {node_total}")
edge_total = target_pattern_graph.number_of_edges()
print(f"   Edges (relationships): {edge_total}")

## Graph Statistics Comparison

In [None]:
source_graph = replicator.source_pattern_graph

# Size of each pattern graph; these counts are reused by later cells.
source_nodes = source_graph.number_of_nodes()
target_nodes = target_pattern_graph.number_of_nodes()
source_edges = source_graph.number_of_edges()
target_edges = target_pattern_graph.number_of_edges()

# Target size as a percentage of source size. An empty source graph would
# otherwise raise ZeroDivisionError here; report 0 instead, matching the
# guards used by the coverage calculations later in the notebook.
node_ratio_pct = target_nodes / source_nodes * 100 if source_nodes > 0 else 0
edge_ratio_pct = target_edges / source_edges * 100 if source_edges > 0 else 0

print("📊 Pattern Graph Comparison:\n")
print(f"{'Metric':<30} {'Source':<15} {'Target':<15} {'Coverage %'}")
print("-" * 75)
print(f"{'Nodes (resource types)':<30} {source_nodes:<15} {target_nodes:<15} {node_ratio_pct:.1f}%")
print(f"{'Edges (relationships)':<30} {source_edges:<15} {target_edges:<15} {edge_ratio_pct:.1f}%")

# Compute spectral distance
spectral_distance = replicator._compute_spectral_distance(
    source_graph, target_pattern_graph
)
print(f"\n📐 Spectral Distance: {spectral_distance:.4f} (lower = better structural match)")

## Node Overlap Analysis

In [None]:
# Partition the resource types by which pattern graph(s) contain them.
source_nodes_set = set(source_graph.nodes())
target_nodes_set = set(target_pattern_graph.nodes())

common_nodes = source_nodes_set & target_nodes_set
missing_nodes = source_nodes_set - target_nodes_set
extra_nodes = target_nodes_set - source_nodes_set

print("📊 Node Overlap Analysis:\n")
for label, group in (
    ("Source unique", source_nodes_set),
    ("Target unique", target_nodes_set),
    ("Common", common_nodes),
    ("Missing from target", missing_nodes),
    ("Extra in target", extra_nodes),
):
    print(f"   {label}: {len(group)}")

# Bar chart visualization: one bar per overlap category.
categories = ['Common\n(in both)', 'Source Only\n(missing from target)', 'Target Only\n(extra)']
values = [len(common_nodes), len(missing_nodes), len(extra_nodes)]
colors = ['#99ff99', '#ff9999', '#99ccff']

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(categories, values, color=colors, alpha=0.8, edgecolor='black', linewidth=2)

# Write each bar's count just above its top edge, centred horizontally.
for idx, rect in enumerate(bars):
    centre_x = rect.get_x() + rect.get_width() / 2
    ax.text(centre_x, rect.get_height(), f'{values[idx]}',
            ha='center', va='bottom', fontsize=14, fontweight='bold')

ax.set_ylabel('Number of Resource Types', fontsize=12, fontweight='bold')
ax.set_title('Resource Type Overlap Between Source and Target Pattern Graphs',
             fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3, linestyle='--')

plt.tight_layout()
plt.show()

# Share of source resource types that also appear in the target (0 when the
# source graph has no nodes).
if len(source_nodes_set) > 0:
    coverage_pct = len(common_nodes) / len(source_nodes_set) * 100
else:
    coverage_pct = 0
print(f"\n📈 Coverage: {coverage_pct:.1f}% of source types included in target")

# List the missing types, highest source-graph degree first.
if missing_nodes:
    print("\n🔍 Top 10 missing resource types:")
    source_degrees = dict(source_graph.degree())
    missing_by_degree = sorted(
        ((name, source_degrees.get(name, 0)) for name in missing_nodes),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for rank, (node, degree) in enumerate(missing_by_degree[:10], start=1):
        print(f"   {rank}. {node} (degree: {degree})")

## Side-by-Side Graph Visualization

In [None]:
# Limit to top nodes by degree for better visualization
TOP_N = 40

# Get top nodes from source graph
source_degrees = dict(source_graph.degree())
top_source_nodes = sorted(source_degrees.items(), key=lambda x: x[1], reverse=True)[:TOP_N]
source_subgraph = source_graph.subgraph([n for n, _ in top_source_nodes]).copy()

# Get top nodes from target graph (or all if fewer than TOP_N)
target_degrees = dict(target_pattern_graph.degree())
top_target_nodes = sorted(target_degrees.items(), key=lambda x: x[1], reverse=True)[:TOP_N]
target_subgraph = target_pattern_graph.subgraph([n for n, _ in top_target_nodes]).copy()

# Identify missing edges: edges in source but not in target.
# Edges are compared as (source, target, relationship) tuples.
source_edge_set = {
    (u, v, data.get('relationship', 'UNKNOWN'))
    for u, v, data in source_subgraph.edges(data=True)
}

# Build the target side from the FULL target graph, not the top-N subgraph.
# The node filter below checks membership in the full target node set, so an
# edge the target does contain must not be reported as missing merely because
# one of its endpoints falls outside the target's top-N drawing.
target_edge_set = {
    (u, v, data.get('relationship', 'UNKNOWN'))
    for u, v, data in target_pattern_graph.edges(data=True)
}

# Edges that exist in source but not in target (only for nodes present in both graphs)
missing_edges = [
    (u, v, rel)
    for u, v, rel in source_edge_set
    if u in target_nodes_set and v in target_nodes_set
    and (u, v, rel) not in target_edge_set
]

print("📊 Edge Analysis:")
print(f"   Source subgraph edges: {source_subgraph.number_of_edges()}")
print(f"   Target subgraph edges: {target_subgraph.number_of_edges()}")
print(f"   Missing edges (in source, not in target): {len(missing_edges)}")

# Create figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 10))

# Function to draw a graph with edge highlighting
def draw_pattern_graph(G, ax, title, highlight_missing=False):
    """Draw a pattern graph on ``ax`` with degree-scaled, overlap-coloured nodes.

    Reads the notebook-level names ``source_nodes_set``, ``target_nodes_set``
    and ``missing_edges`` in addition to its arguments.

    Args:
        G: networkx graph to draw.
        ax: matplotlib axes to draw on.
        title: Title text; a line of node/edge counts is appended to it.
        highlight_missing: When True (and ``missing_edges`` is non-empty),
            edges listed in ``missing_edges`` are drawn in red over faint grey
            regular edges. Otherwise every edge is drawn in green.

    Returns:
        None. The axes are modified in place.
    """
    # Nothing to lay out: show a placeholder message instead of an empty plot.
    if G.number_of_nodes() == 0:
        ax.text(0.5, 0.5, 'No nodes to display',
                ha='center', va='center', transform=ax.transAxes, fontsize=16)
        ax.set_title(title, fontsize=14, fontweight='bold', pad=20)
        ax.axis('off')
        return

    # Layout (fixed seed so repeated runs place nodes identically)
    pos = nx.spring_layout(G, k=2, iterations=50, seed=42)

    # Node sizes based on degree
    degrees = dict(G.degree())
    node_sizes = [degrees[node] * 100 + 200 for node in G.nodes()]

    # Node colors: blue for types present in both graphs, red otherwise
    common = source_nodes_set.intersection(target_nodes_set)
    node_colors = ['#66b3ff' if node in common else '#ff9999' for node in G.nodes()]

    # Draw edges with highlighting for source graph
    if highlight_missing and missing_edges:
        # Set copy of the missing edges so the per-edge membership test below
        # is a hash lookup rather than a scan of the whole list.
        missing_lookup = set(missing_edges)

        # Draw regular edges first (gray)
        regular_edges = [
            (u, v)
            for u, v, data in G.edges(data=True)
            if (u, v, data.get('relationship', 'UNKNOWN')) not in missing_lookup
        ]

        if regular_edges:
            nx.draw_networkx_edges(G, pos, edgelist=regular_edges, alpha=0.2,
                                  edge_color='gray', arrows=True, arrowsize=10,
                                  width=1.5, connectionstyle='arc3,rad=0.1', ax=ax)

        # Draw missing edges in red (highlighted); only those actually present
        # in G can be drawn.
        missing_edge_list = [(u, v) for u, v, rel in missing_edges if G.has_edge(u, v)]

        if missing_edge_list:
            nx.draw_networkx_edges(G, pos, edgelist=missing_edge_list, alpha=0.6,
                                  edge_color='#FF6B6B', arrows=True, arrowsize=12,
                                  width=3, connectionstyle='arc3,rad=0.1', ax=ax)
    else:
        # Draw all edges normally (for target graph)
        nx.draw_networkx_edges(G, pos, alpha=0.3, edge_color='#4CAF50',
                              arrows=True, arrowsize=10, width=2,
                              connectionstyle='arc3,rad=0.1', ax=ax)

    # Draw nodes
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors,
                          alpha=0.9, edgecolors='black', linewidths=2, ax=ax)

    # Draw labels
    nx.draw_networkx_labels(G, pos, font_size=9, font_weight='bold', ax=ax)

    # Title with stats
    stats_text = f"{G.number_of_nodes()} types, {G.number_of_edges()} edges"
    if highlight_missing:
        stats_text += f"\n{len(missing_edges)} missing edges (red)"
    ax.set_title(f"{title}\n{stats_text}", fontsize=14, fontweight='bold', pad=20)
    ax.axis('off')

# Draw both graphs: the source panel highlights missing edges, the target
# panel draws every edge the same way.
draw_pattern_graph(source_subgraph, ax1, f"Source Pattern Graph (Top {TOP_N})", highlight_missing=True)
draw_pattern_graph(target_subgraph, ax2, f"Target Pattern Graph (Top {min(TOP_N, target_nodes)})", highlight_missing=False)

# Add legend. `Patch` is imported with the other dependencies in the setup
# cell rather than here, so all imports live in one place.
legend_elements = [
    Patch(facecolor='#66b3ff', edgecolor='black', label='Common Types'),
    Patch(facecolor='#ff9999', edgecolor='black', label='Unique Types'),
    Patch(facecolor='white', edgecolor='#FF6B6B', linewidth=3, label='Missing Edges (in source, not target)')
]
fig.legend(handles=legend_elements, loc='upper center', ncol=3,
          fontsize=12, bbox_to_anchor=(0.5, 0.98))

# Leave the top 4% of the figure free for the legend.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

print(f"\n📊 Visualization shows top {TOP_N} nodes by degree from each graph")
print("   Blue nodes: Common resource types in both graphs")
print("   Red nodes: Unique to that graph")
print("   Red edges (source graph): Missing relationships not yet in target")
print("   Green edges (target graph): Captured relationships")

# Summarise which relationship types account for the most missing edges.
if missing_edges:
    print("\n🔍 Top missing edge types (need more instances to capture):")
    missing_edge_types = Counter(rel for _, _, rel in missing_edges)
    for rel, count in missing_edge_types.most_common(5):
        print(f"   {rel}: {count} missing edges")

## Spectral Distance Evolution

In [None]:
# Plot how the spectral distance changed over the recorded selection steps,
# then summarise the start, end and best values.
if spectral_history:
    initial_distance = spectral_history[0]
    final_distance = spectral_history[-1]
    min_distance = min(spectral_history)
    # index() returns the first step at which the minimum was reached.
    min_step = spectral_history.index(min_distance)

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.plot(range(len(spectral_history)), spectral_history,
            marker='o', linewidth=2, markersize=8, alpha=0.7, color='#2E86AB')

    ax.set_xlabel('Instance Selection Step', fontsize=12, fontweight='bold')
    ax.set_ylabel('Spectral Distance', fontsize=12, fontweight='bold')
    ax.set_title('Spectral Distance Evolution as Instances are Added\n(Lower = Better Match)',
                 fontsize=14, fontweight='bold', pad=20)

    ax.grid(True, alpha=0.3)

    # Add horizontal line for final value
    ax.axhline(y=final_distance, color='red', linestyle='--',
               alpha=0.5, linewidth=2, label=f'Final: {final_distance:.4f}')

    # Add horizontal line for initial value
    ax.axhline(y=initial_distance, color='orange', linestyle='--',
               alpha=0.5, linewidth=2, label=f'Initial: {initial_distance:.4f}')

    ax.legend(fontsize=11, loc='best')
    ax.set_xlim(-0.5, len(spectral_history) - 0.5)

    plt.tight_layout()
    plt.show()

    # A relative change is undefined when the first recorded distance is 0;
    # report only the absolute change in that case instead of dividing by zero.
    change = final_distance - initial_distance
    if initial_distance != 0:
        change_text = f"{change:.4f} ({change / initial_distance:.1%})"
    else:
        change_text = f"{change:.4f} (relative change undefined: initial distance is 0)"

    print("\n📈 Spectral Distance Analysis:")
    print(f"   Initial distance: {initial_distance:.4f}")
    print(f"   Final distance: {final_distance:.4f}")
    print(f"   Change: {change_text}")
    print(f"   Min distance: {min_distance:.4f} (at step {min_step})")
else:
    print("⚠️  No spectral history available")

---
# Coverage Analysis & Recommendations
---

In [None]:
print("="*80)
print("COVERAGE ANALYSIS & RECOMMENDATIONS")
print("="*80)

# Calculate coverage metrics
node_coverage = len(common_nodes) / len(source_nodes_set) * 100 if len(source_nodes_set) > 0 else 0
edge_coverage = target_edges / source_edges * 100 if source_edges > 0 else 0

# Totals for what the plan actually selected, taken from the flattened
# (pattern_name, instance) pairs used to build the target graph. The name
# `total_resources` must not be used here: it is a leftover loop variable from
# the source-analysis cell and holds only the last listed pattern's count.
# NOTE(review): assumes len(instance) is the number of resources in an
# instance, as in the per-pattern breakdown of the analysis cell — confirm.
selected_instance_total = len(flattened_instances)
selected_resource_total = sum(len(instance) for _, instance in flattened_instances)

# Instance total reported by the source analysis (0 if the key is absent).
total_source_instances = analysis.get('total_pattern_resources', 0)

print("\n📊 Current Configuration:")
print(f"   Selected instances: {selected_instance_total} (target: {TARGET_INSTANCE_COUNT})")
print(f"   Total resources: {selected_resource_total}")
print(f"   Node coverage: {node_coverage:.1f}%")
print(f"   Edge coverage: {edge_coverage:.1f}%")
print(f"   Spectral distance: {spectral_distance:.4f}")

# Coverage assessment: bucket node coverage into LOW (<30%), MEDIUM (<60%), HIGH.
print("\n🎯 Coverage Assessment:")
if node_coverage < 30:
    assessment = "LOW"
    color = "⚠️"
    # Refer to the configured value rather than a fixed number, which would be
    # wrong whenever the target is already above it.
    recommendation = (
        f"Consider increasing TARGET_INSTANCE_COUNT (current: {TARGET_INSTANCE_COUNT}) "
        "for better coverage"
    )
elif node_coverage < 60:
    assessment = "MEDIUM"
    color = "✓"
    recommendation = "Good balance between coverage and selectivity"
else:
    assessment = "HIGH"
    color = "✅"
    recommendation = "Excellent coverage - represents most of source tenant"

print(f"   {color} Coverage level: {assessment}")
print(f"   {recommendation}")

# Missing node types analysis, highest source-graph degree first
print("\n📋 Missing High-Degree Node Types:")
print("   (Top 10 missing types by degree in source)")
missing_nodes = source_nodes_set - target_nodes_set
source_degrees = dict(source_graph.degree())
missing_by_degree = sorted([(n, source_degrees.get(n, 0)) for n in missing_nodes],
                           key=lambda x: x[1], reverse=True)

for i, (node, degree) in enumerate(missing_by_degree[:10], 1):
    print(f"   {i}. {node} (degree: {degree})")

if missing_by_degree:
    print("\n💡 To include these types:")
    print(f"   - Increase TARGET_INSTANCE_COUNT (current: {TARGET_INSTANCE_COUNT})")
    print(f"   - Or set TARGET_INSTANCE_COUNT = None to select all {total_source_instances} instances")

# Spectral distance interpretation: fixed thresholds at 0.1 / 0.2 / 0.3
print("\n📐 Spectral Distance Interpretation:")
if spectral_distance < 0.1:
    quality = "EXCELLENT"
    desc = "Target graph is structurally very similar to source"
elif spectral_distance < 0.2:
    quality = "GOOD"
    desc = "Target graph preserves most structural patterns from source"
elif spectral_distance < 0.3:
    quality = "MODERATE"
    desc = "Target graph captures some structural patterns, but differences exist"
else:
    quality = "LOW"
    desc = "Target graph is structurally quite different from source"

print(f"   Distance: {spectral_distance:.4f} ({quality})")
print(f"   {desc}")

# Recommendations table. The 10/50/100 rows are static reference figures
# written into the notebook, not values computed from this run; only the
# "All" row takes its instance count from the analysis above.
all_label = f"All ({total_source_instances})"
print("\n" + "="*80)
print("QUICK REFERENCE: Instance Count vs Coverage")
print("="*80)
print(f"{'Instances':<12} {'Resources':<12} {'Node Coverage':<15} {'Edge Coverage':<15} {'Use Case'}")
print("-"*80)
print(f"{'10':<12} {'~500':<12} {'~18%':<15} {'~14%':<15} {'Quick demo/testing'}")
print(f"{'50':<12} {'~2500':<12} {'~47%':<15} {'~40%':<15} {'Balanced replication'}")
print(f"{'100':<12} {'~3000+':<12} {'~65%':<15} {'~60%':<15} {'High-fidelity replication'}")
print(f"{all_label:<12} {'All':<12} {'100%':<15} {'100%':<15} {'Complete replication'}")

print(f"\n✨ Current setting: {TARGET_INSTANCE_COUNT} instances → {node_coverage:.1f}% node coverage")