# Component Tracking Dashboard

This notebook provides examples of how to analyze component impact and make data-driven decisions about which components to keep or remove.

In [None]:
# Setup imports
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Keep this ahead of the project import below; it extends the module search path.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('.'))))

from src.utils.component_tracker import (
    analyze_component_impact,
    compare_component_versions,
    identify_redundancies,
    ComponentTracker
)

# Set up plotting style
# The style sheet is applied first and the seaborn palette second, so the
# "husl" palette is the colour cycle that remains in effect for later plots.
# NOTE: the 'seaborn-v0_8-*' style names require matplotlib 3.6 or newer.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

## 1. Component Impact Analysis

Analyze the impact of individual components on the extraction pipeline.

In [None]:
# Components whose impact on the pipeline should be analyzed
components = [
    "segment_preprocessor",
    "entity_resolution",
    "metadata_enricher",
    "quote_extractor",
]

# Build one impact report per component. A failure on one component is
# printed and does not stop the analysis of the remaining ones.
impact_reports = {}
for name in components:
    try:
        report = analyze_component_impact(name)
        impact_reports[name] = report
        print(f"\n=== {name} ===")
        print(f"Recommendation: {report.get('recommendation', 'No data')}")

        # Nothing more to print when the report carries no metrics summary.
        if 'metrics_summary' not in report:
            continue

        summary = report['metrics_summary']
        print(f"Total executions: {summary.get('total_executions', 0)}")
        print(f"Success rate: {summary.get('success_rate', 0):.2%}")
        print(f"Avg execution time: {summary.get('avg_execution_time', 0):.3f}s")
    except Exception as exc:
        print(f"Error analyzing {name}: {exc}")

## 2. Performance Visualization

Visualize component performance metrics.

In [None]:
# Collect one row of performance metrics for every report that has a
# metrics summary; missing metrics default to 0.
perf_data = [
    {
        'Component': name,
        'Execution Time (s)': report['metrics_summary'].get('avg_execution_time', 0),
        'Success Rate': report['metrics_summary'].get('success_rate', 0),
        'Executions': report['metrics_summary'].get('total_executions', 0),
    }
    for name, report in impact_reports.items()
    if 'metrics_summary' in report
]

if not perf_data:
    print("No performance data available")
else:
    df = pd.DataFrame(perf_data)

    # One panel per metric:
    # (column, title, y-axis label, extra plot keyword arguments, y-limits)
    panels = [
        ('Execution Time (s)', 'Average Execution Time', 'Time (seconds)', {}, None),
        ('Success Rate', 'Success Rate', 'Success Rate', {'color': 'green'}, (0, 1.1)),
        ('Executions', 'Total Executions', 'Count', {'color': 'orange'}, None),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    for ax, (column, title, ylabel, plot_kwargs, ylim) in zip(axes, panels):
        df.plot(x='Component', y=column, kind='bar', ax=ax, legend=False, **plot_kwargs)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        if ylim is not None:
            ax.set_ylim(*ylim)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 3. Component Contributions

Analyze what each component contributes to the extraction pipeline.

In [None]:
# Flatten every report's contribution records into one row per record.
contribution_data = []
for component, impact in impact_reports.items():
    # Reports with no (or an empty) 'contributions' entry add no rows.
    for contrib in impact.get('contributions') or []:
        contribution_data.append({
            'Component': component,
            'Type': contrib['type'],
            'Count': contrib['total_count'],
            'Occurrences': contrib['occurrences']
        })

if contribution_data:
    contrib_df = pd.DataFrame(contribution_data)

    # Sum the counts per (component, type). pivot_table defaults to
    # aggfunc='mean', which would average the counts whenever a component
    # reports the same contribution type more than once, contradicting the
    # "Total Contributions" axis label below.
    pivot_df = contrib_df.pivot_table(
        index='Component',
        columns='Type',
        values='Count',
        aggfunc='sum',
        fill_value=0
    )

    # Stacked bar chart: one bar per component, one segment per type.
    ax = pivot_df.plot(kind='bar', stacked=True, figsize=(10, 6))
    ax.set_title('Component Contributions by Type')
    ax.set_ylabel('Total Contributions')
    ax.tick_params(axis='x', rotation=45)
    # Anchor the legend outside the axes so it does not cover the bars.
    ax.legend(title='Contribution Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
else:
    print("No contribution data available")

## 4. Version Comparison

Compare different versions of components to track improvements or regressions.

In [None]:
# Example: compare two releases of a single component
component_name = "segment_preprocessor"
v1, v2 = "1.0.0", "1.1.0"

try:
    comparison = compare_component_versions(component_name, v1, v2)

    print(f"\nVersion Comparison: {component_name}")
    print(f"Recommendation: {comparison.get('recommendation', 'No data')}")

    # Per-version metrics are printed only when the comparison provides them.
    if 'version_comparison' in comparison:
        per_version = comparison['version_comparison']
        for version_label, stats in per_version.items():
            print(f"\nVersion {version_label}:")
            print(f"  Executions: {stats.get('total_executions', 0)}")
            print(f"  Avg time: {stats.get('avg_execution_time', 0):.3f}s")
            print(f"  Success rate: {stats.get('success_rate', 0):.2%}")
except Exception as err:
    print(f"Error comparing versions: {err}")

## 5. Redundancy Analysis

Identify components with overlapping functionality.

In [None]:
# Find component pairs with overlapping contributions.
redundancies = identify_redundancies()

if redundancies:
    print("\nPotential Redundancies Found:")
    print("=" * 60)

    # Only the first five entries are printed. They are the "top" five only if
    # identify_redundancies() returns its results ranked — TODO confirm.
    for r in redundancies[:5]:
        print(f"\n{r['component1']} <-> {r['component2']}")
        print(f"  Shared contribution: {r['shared_contribution']}")
        print(f"  Overlap ratio: {r['overlap_ratio']:.2%}")

    # Every component that appears in at least one redundancy pair, sorted so
    # the heatmap axes are stable. A dedicated name is used on purpose:
    # rebinding `components` here would overwrite the list of components to
    # analyze that section 1 defines.
    redundant_components = sorted(
        {name for r in redundancies for name in (r['component1'], r['component2'])}
    )
    n = len(redundant_components)

    if n > 1:
        # Name -> row/column position, so each lookup is a dict access instead
        # of a linear list.index() scan.
        position = {name: idx for idx, name in enumerate(redundant_components)}

        # Symmetric overlap matrix; pairs without a reported overlap (and the
        # diagonal) stay at 0.
        matrix = np.zeros((n, n))
        for r in redundancies:
            i = position[r['component1']]
            j = position[r['component2']]
            matrix[i, j] = r['overlap_ratio']
            matrix[j, i] = r['overlap_ratio']

        # Plot heatmap
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(matrix,
                    xticklabels=redundant_components,
                    yticklabels=redundant_components,
                    annot=True,
                    fmt='.2f',
                    cmap='YlOrRd',
                    vmin=0, vmax=1,
                    ax=ax)
        ax.set_title('Component Redundancy Matrix')
        plt.tight_layout()
        plt.show()
else:
    print("No redundancies found")

## 6. Decision Matrix

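Create a decision matrix to help determine which components to keep or remove. Each component receives three scores between 0 and 1 — performance (based on average execution time, where 5 seconds or more scores 0), reliability (the success rate), and contribution (total contribution count, capped at 100) — and the overall score is their unweighted mean.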

In [None]:
# Scoring scales for the decision matrix.
# An average execution time at or above this many seconds scores 0 for performance.
MAX_EXECUTION_TIME_S = 5.0
# A total contribution count at or above this value scores 1 for contribution.
CONTRIBUTION_TARGET = 100

# Build decision matrix: one row of scores per component that has metrics.
decision_data = []

for component, impact in impact_reports.items():
    if 'metrics_summary' not in impact:
        continue
    metrics = impact['metrics_summary']

    # Faster components score higher; the ratio is capped so the score never
    # drops below 0.
    performance_score = 1.0 - min(
        metrics.get('avg_execution_time', 0) / MAX_EXECUTION_TIME_S, 1.0
    )
    reliability_score = metrics.get('success_rate', 0)

    # Components without contribution records score 0 here.
    total_contributions = sum(
        c['total_count'] for c in impact.get('contributions') or []
    )
    contribution_score = min(total_contributions / CONTRIBUTION_TARGET, 1.0)

    # Unweighted mean of the three category scores.
    overall_score = (performance_score + reliability_score + contribution_score) / 3

    decision_data.append({
        'Component': component,
        'Performance': performance_score,
        'Reliability': reliability_score,
        'Contribution': contribution_score,
        'Overall': overall_score,
        'Recommendation': impact.get('recommendation', 'No data')
    })

if decision_data:
    # Best overall score first.
    decision_df = pd.DataFrame(decision_data).sort_values('Overall', ascending=False)

    # Display decision matrix
    print("\nComponent Decision Matrix")
    print("=" * 80)
    print(decision_df.to_string(index=False))

    # Grouped bars: three category scores side by side for each component.
    fig, ax = plt.subplots(figsize=(10, 6))

    # `np` comes from the notebook's top-level imports, so this cell does not
    # depend on whether an earlier cell happened to import numpy.
    x = np.arange(len(decision_df))
    width = 0.2

    ax.bar(x - width, decision_df['Performance'], width, label='Performance')
    ax.bar(x, decision_df['Reliability'], width, label='Reliability')
    ax.bar(x + width, decision_df['Contribution'], width, label='Contribution')

    ax.set_xlabel('Component')
    ax.set_ylabel('Score (0-1)')
    ax.set_title('Component Scores by Category')
    ax.set_xticks(x)
    ax.set_xticklabels(decision_df['Component'], rotation=45)
    ax.legend()
    ax.set_ylim(0, 1.1)

    plt.tight_layout()
    plt.show()
else:
    print("No decision data available")

## 7. Export Report

Export the analysis results for documentation.

In [None]:
# Overall-score cut-offs that map a component's score to a recommended action:
# below REMOVE_BELOW -> "REMOVE", below OPTIMIZE_BELOW -> "OPTIMIZE", else "KEEP".
REMOVE_BELOW = 0.3
OPTIMIZE_BELOW = 0.6
REPORT_PATH = 'component_analysis_report.json'

# Generate summary report
report = {
    "analysis_date": pd.Timestamp.now().isoformat(),
    "components_analyzed": len(impact_reports),
    "recommendations": {
        component: impact.get('recommendation', 'No data')
        for component, impact in impact_reports.items()
    },
    # Section 5 treats any falsy `redundancies` value as "no redundancies";
    # do the same here so a None result counts as 0 instead of raising.
    "redundancies_found": len(redundancies or []),
    "decision_summary": []
}

# Add decision summary (stays empty when no component had metrics).
for item in decision_data:
    if item['Overall'] < REMOVE_BELOW:
        action = "REMOVE"
    elif item['Overall'] < OPTIMIZE_BELOW:
        action = "OPTIMIZE"
    else:
        action = "KEEP"

    report["decision_summary"].append({
        "component": item['Component'],
        # float() keeps the value JSON-serializable even if the score is a
        # non-builtin numeric type.
        "score": round(float(item['Overall']), 2),
        "action": action
    })

# Save report; the encoding is explicit so the file does not depend on the
# platform's default encoding.
with open(REPORT_PATH, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print("\nAnalysis Summary:")
print(f"Components analyzed: {report['components_analyzed']}")
print(f"Redundancies found: {report['redundancies_found']}")
print("\nRecommended Actions:")
for item in report['decision_summary']:
    print(f"  {item['component']}: {item['action']} (score: {item['score']})")
print(f"\nFull report saved to: {REPORT_PATH}")