# DIMS Interactive Explorer

**Data Inventory Management System - Interactive Analysis & Visualization**

This notebook provides comprehensive tools for:
- 📊 Metric overview and analysis
- 📈 Historical trend visualization
- 🔍 Verification run exploration
- ✅ Approval workflow tracking
- 📝 Event log analysis
- 💾 Data export utilities

---

## 1. Setup & Initialization

In [None]:
# Import required libraries
import sys
import os
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Configure pandas display: show every column, cap printed frames at 100
# rows, and render floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.2f}'.format

print("✓ Libraries imported successfully")

In [None]:
# Add project to path
project_root = Path.home() / 'nba-simulator-aws'
monitoring_dir = str(project_root / 'scripts' / 'monitoring')
# Notebook cells get re-run; only insert the directory once so sys.path does
# not accumulate duplicate entries.
if monitoring_dir not in sys.path:
    sys.path.insert(0, monitoring_dir)

# Import DIMS modules (must come after the sys.path change above;
# presumably the `dims` package lives in scripts/monitoring — confirm)
from dims.core import DIMSCore
from dims.notebook_utils import DIMSNotebookHelper

print(f"✓ DIMS modules loaded from {project_root}")

In [None]:
# Initialize DIMS: build the core object for this project and wrap it in the
# notebook helper that the remaining cells call.
dims = DIMSCore(project_root=str(project_root))
helper = DIMSNotebookHelper(dims)

# Echo the basics so a misconfigured environment is visible immediately.
print("✓ DIMS initialized")
print(f"  Project: {dims.config.get('project_root')}")
# A falsy dims.database is reported as "Not available".
print(f"  Database: {'Connected' if dims.database else 'Not available'}")
print(f"  Total Metrics: {len(dims.metrics)}")

## 2. System Overview

In [None]:
# Get system health metrics
# `health` is indexed by key below; the keys read here are the only ones this
# cell relies on.
health = helper.get_system_health()

print("📊 DIMS SYSTEM HEALTH")
print("=" * 60)
print(f"Total Metrics Defined:     {health['total_metrics']}")
print(f"Metrics with Data:         {health['metrics_with_data']}")
print(f"Recent Verifications (7d): {health['recent_verifications']}")
print(f"Pending Approvals:         {health['pending_approvals']}")
print(f"Recent Events (7d):        {health['recent_events']}")
# Shown rounded to whole milliseconds.
print(f"Avg Execution Time:        {health['avg_execution_time_ms']:.0f} ms")
print("=" * 60)

In [None]:
# Visualize metrics by category: the helper builds the figure, .show()
# displays it.
fig = helper.plot_metrics_overview()
fig.show()

## 3. Latest Metrics

In [None]:
# Get all latest metrics
df_latest = helper.get_latest_metrics()

print(f"\n📈 LATEST METRICS ({len(df_latest)} total)")
print("=" * 80)
# Bare trailing expression: the notebook renders it as the cell output.
df_latest

In [None]:
# Narrow the latest-metrics table to a single category.
category = 'code_base'  # Change this to explore different categories

# Keep only the rows whose metric_category matches the selection.
df_category = df_latest.loc[df_latest['metric_category'].eq(category)]
print(f"\n{category.upper()} METRICS")
print("=" * 80)
df_category

## 4. Metric Trend Analysis

In [None]:
# Plot trend for a specific metric (customize these)
category = 'code_base'   # metric category passed to the helper
metric = 'python_files'  # metric name passed to the helper
days = 30                # look-back window passed to the helper

fig = helper.plot_metric_trend(category, metric, days)
fig.show()

In [None]:
# Get drift summary for all metrics
df_drift = helper.get_drift_summary(days=30)

print("\n📉 DRIFT ANALYSIS (30 days)")
print("=" * 80)

# Rank by magnitude of drift, largest first. The scratch column exists only
# on the temporary copy made by assign(), so df_drift itself is not mutated,
# and an empty summary is passed through instead of raising on a missing
# 'drift_pct' column.
if df_drift.empty:
    df_drift_sorted = df_drift
else:
    df_drift_sorted = (
        df_drift
        .assign(abs_drift=df_drift['drift_pct'].abs())
        .sort_values('abs_drift', ascending=False)
        .drop(columns='abs_drift')
    )

# Bare trailing expression: rendered as the cell output.
df_drift_sorted

In [None]:
# Visualize drift across all metrics over a 30-day window.
fig = helper.plot_drift_heatmap(days=30)
fig.show()

## 5. Verification Run History

In [None]:
# Get recent verification runs (30-day window); df_verifications is reused by
# the statistics cell further down.
df_verifications = helper.get_verification_runs(days=30)

print(f"\n🔍 VERIFICATION RUNS ({len(df_verifications)} runs in last 30 days)")
print("=" * 80)
# Bare trailing expression: rendered as the cell output.
df_verifications

In [None]:
# Visualize verification timeline over the same 30-day window.
fig = helper.plot_verification_timeline(days=30)
fig.show()

In [None]:
# Verification statistics
# Skipped entirely when no runs were returned, so the aggregates below never
# run against an empty frame.
if not df_verifications.empty:
    print("\n📊 VERIFICATION STATISTICS")
    print("=" * 60)
    print(f"Total Runs:               {len(df_verifications)}")
    # Summing the flag columns counts the runs where the flag is set.
    print(f"Runs with Drift:          {df_verifications['drift_detected'].sum()}")
    print(f"Runs with Auto-Update:    {df_verifications['auto_updated'].sum()}")
    print(f"Avg Metrics Verified:     {df_verifications['metrics_verified'].mean():.1f}")
    # execution_time_ms is divided by 1000 to report seconds.
    print(f"Avg Execution Time:       {df_verifications['execution_time_ms'].mean()/1000:.1f}s")
    print(f"Min Execution Time:       {df_verifications['execution_time_ms'].min()/1000:.1f}s")
    print(f"Max Execution Time:       {df_verifications['execution_time_ms'].max()/1000:.1f}s")
    print("=" * 60)

## 6. Approval Workflow

In [None]:
# Get approval history (30-day window); df_approvals is reused by the
# statistics cell further down.
df_approvals = helper.get_approval_log(days=30)

print(f"\n✅ APPROVAL WORKFLOW ({len(df_approvals)} requests in last 30 days)")
print("=" * 80)
# Bare trailing expression: rendered as the cell output.
df_approvals

In [None]:
# Visualize approval status over the same 30-day window.
fig = helper.plot_approval_status(days=30)
fig.show()

In [None]:
# Summarise the approval requests by status and by drift size.
if not df_approvals.empty:
    print("\n📊 APPROVAL STATISTICS")
    print("=" * 60)
    print(f"Total Requests:           {len(df_approvals)}")
    # Tally every status once; a status that never occurs reports 0.
    status_counts = df_approvals['status'].value_counts()
    print(f"Pending:                  {status_counts.get('pending', 0)}")
    print(f"Approved:                 {status_counts.get('approved', 0)}")
    print(f"Rejected:                 {status_counts.get('rejected', 0)}")
    drift_values = df_approvals['drift_pct']
    print(f"Avg Drift:                {drift_values.mean():.2f}%")
    print(f"Max Drift:                {drift_values.max():.2f}%")
    print("=" * 60)

## 7. Event Log Analysis

In [None]:
# Get event log (7-day window); df_events is reused by the statistics cell
# further down.
df_events = helper.get_event_log(days=7)

print(f"\n📝 EVENT LOG ({len(df_events)} events in last 7 days)")
print("=" * 80)
# Bare trailing expression: rendered as the cell output.
df_events

In [None]:
# Event statistics
if not df_events.empty:
    # Count successes by explicit comparison instead of negating the column
    # with `~`: on a non-boolean column (0/1 integers, or objects containing
    # None) `~` is a bitwise NOT and yields wrong, even negative, totals.
    # Anything that is not a true success is reported as failed.
    successful = int(df_events['success'].eq(True).sum())
    failed = len(df_events) - successful

    print("\n📊 EVENT STATISTICS")
    print("=" * 60)
    print(f"Total Events:             {len(df_events)}")
    print(f"Successful:               {successful}")
    print(f"Failed:                   {failed}")
    print(f"Total Metrics Updated:    {df_events['metrics_updated'].sum()}")
    print("\nEvents by Type:")
    print(df_events.groupby('event_type').size())
    print("=" * 60)

## 8. Custom Queries

Use this section for ad-hoc queries and custom analysis.

In [None]:
# Example: Find metrics with highest volatility
df_drift = helper.get_drift_summary(days=30)

if not df_drift.empty:
    # Ten rows with the largest volatility_pct, trimmed to the columns of interest.
    top_volatile = df_drift.nlargest(10, 'volatility_pct')[['category', 'metric', 'volatility_pct', 'drift_pct']]

    print("\n🔥 TOP 10 MOST VOLATILE METRICS")
    print("=" * 80)
    # A bare expression is only rendered when it is the last top-level
    # statement of a cell; inside this `if` it has to be displayed explicitly.
    display(top_volatile)

In [None]:
# Example: Custom database query
# Runs only when a database handle is available (dims.database is truthy).
if dims.database:
    # Per-category count and average of the non-null numeric values.
    query = """
        SELECT 
            metric_category,
            COUNT(*) as metric_count,
            AVG(numeric_value) as avg_value
        FROM dims_metrics_latest
        WHERE numeric_value IS NOT NULL
        GROUP BY metric_category
        ORDER BY metric_count DESC
    """
    
    # Borrow a connection from the pool; the finally block hands it back even
    # if the query or the display raises.
    conn = dims.database.pool.getconn()
    try:
        df_custom = pd.read_sql_query(query, conn)
        print("\n📊 METRICS BY CATEGORY (with averages)")
        print("=" * 80)
        display(df_custom)
    finally:
        dims.database.pool.putconn(conn)

In [None]:
# Overlay the 30-day history of several metrics on a single chart.
# Each entry is a (category, metric) pair.
metrics_to_compare = [
    ('code_base', 'python_files'),
    ('code_base', 'test_files'),
    ('documentation', 'markdown_files'),
]

fig = go.Figure()

for category, metric in metrics_to_compare:
    df_history = helper.get_metric_history(category, metric, days=30)
    if df_history.empty:
        # No history for this metric in the window: leave it off the chart.
        continue
    fig.add_trace(
        go.Scatter(
            name=f"{category}.{metric}",
            mode='lines+markers',
            x=df_history['recorded_at'],
            y=df_history['numeric_value'],
        )
    )

fig.update_layout({
    'title': "Multi-Metric Comparison",
    'xaxis_title': "Date",
    'yaxis_title': "Value",
    'template': 'plotly_white',
    'height': 500,
})

fig.show()

## 9. Data Export

In [None]:
# Write a snapshot of the latest metrics to a timestamped CSV file.
df_latest = helper.get_latest_metrics()
# The timestamp keeps successive exports from sharing a file name.
timestamp = format(datetime.now(), '%Y%m%d_%H%M%S')
filename = f"dims_metrics_{timestamp}.csv"

path = helper.export_to_csv(df_latest, filename)
print(f"✓ Exported to: {path}")

In [None]:
# Export comprehensive report to Excel
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"dims_report_{timestamp}.xlsx"

# Gather all data: one sheet per table, keyed by sheet name
dataframes = {
    'Latest Metrics': helper.get_latest_metrics(),
    'Drift Summary': helper.get_drift_summary(days=30),
    'Verification Runs': helper.get_verification_runs(days=30),
    'Approvals': helper.get_approval_log(days=30),
    'Events': helper.get_event_log(days=7)
}

# Remove empty DataFrames
dataframes = {k: v for k, v in dataframes.items() if not v.empty}

# Skip the export when every table is empty: there would be no sheet to
# write (writing a workbook without sheets typically fails).
if dataframes:
    path = helper.export_to_excel(dataframes, filename)
    print(f"✓ Comprehensive report exported to: {path}")
    print(f"  Sheets: {', '.join(dataframes.keys())}")
else:
    print("⚠ No data available - Excel report not written")

## 10. Summary & Recommendations

Based on the analysis above, generate insights and recommendations.

In [None]:
# Turn the 30-day drift summary into a short list of call-outs.
df_drift = helper.get_drift_summary(days=30)

print("\n🎯 AUTOMATED RECOMMENDATIONS")
print("=" * 80)

if not df_drift.empty:
    # Metrics that moved by more than 15% in either direction.
    high_drift = df_drift.loc[df_drift['drift_pct'].abs().gt(15)]
    if not high_drift.empty:
        print("\n⚠️  HIGH DRIFT DETECTED:")
        for _, row in high_drift.iterrows():
            print(f"   • {row['category']}.{row['metric']}: {row['drift_pct']:.1f}% drift")

    # Metrics whose volatility exceeds 25%.
    high_volatility = df_drift.loc[df_drift['volatility_pct'].gt(25)]
    if not high_volatility.empty:
        print("\n📊 HIGH VOLATILITY DETECTED:")
        for _, row in high_volatility.iterrows():
            print(f"   • {row['category']}.{row['metric']}: {row['volatility_pct']:.1f}% volatility")

    # Metrics that barely moved (under 1% in either direction).
    stable = df_drift.loc[df_drift['drift_pct'].abs().lt(1)]
    if not stable.empty:
        print(f"\n✅ STABLE METRICS: {len(stable)} metrics with <1% drift")

# Footer pointing back at the other sections of the notebook.
print("\n" + "=" * 80)
print("\n💡 For detailed analysis, explore specific sections above.")
print("📊 Use the Custom Queries section for ad-hoc investigations.")
print("💾 Export data using the Export section for external analysis.")

---

## Next Steps

1. **Customize Analysis**: Modify the cells above to focus on specific metrics or time periods
2. **Add Custom Queries**: Use Section 8 to write custom SQL queries
3. **Export Reports**: Use Section 9 to generate reports for stakeholders
4. **Schedule Runs**: Set up automated notebook execution for regular reporting

**Documentation**: See `docs/DIMS_JUPYTER_GUIDE.md` for a comprehensive guide.

**CLI Commands**:
```bash
# Launch notebook
dims_cli.py notebook

# Export notebook as HTML report
dims_cli.py notebook export
```

In [None]:
# Import workflow integration
from dims.workflow_integration import WorkflowIntegration

# `workflow` is the handle every workflow cell below calls into; it is built
# from the same project_root as the DIMS core object.
workflow = WorkflowIntegration(str(project_root))

print("✓ Workflow integration loaded")

In [None]:
# Run file inventory workflow (Workflow #13)
# NOTE(review): update=True presumably refreshes the stored inventory — confirm.
result = workflow.run_file_inventory(update=True)

print("\n📁 FILE INVENTORY")
print("=" * 80)
print(f"Total files documented: {result['total_files']}")
print(f"Last updated: {result['last_updated']}")
print("\nCategories:")
# result['categories'] maps a category name to its count.
# Note: this loop rebinds the notebook-level name `category`.
for category, count in result['categories'].items():
    print(f"  - {category}: {count}")
print("=" * 80)

In [None]:
# Run local data inventory (Workflow #45)
result = workflow.run_local_data_inventory(mode='quick')

# Sum once and round to two decimals: adding float sizes directly can surface
# binary floating-point noise (e.g. 6.6000000000000005) in the printed total.
# assumes the three *_size_gb values are numeric — TODO confirm
total_size_gb = round(
    result['archives_size_gb'] + result['temp_size_gb'] + result['project_size_gb'],
    2,
)

print("\n💾 LOCAL DATA INVENTORY")
print("=" * 80)
print(f"Archives: {result['archives_size_gb']} GB")
print(f"Temp data: {result['temp_size_gb']} GB")
print(f"Project: {result['project_size_gb']} GB")
print(f"Total: {total_size_gb} GB")
print("=" * 80)

In [None]:
# Run AWS inventory (Workflow #47)
result = workflow.run_aws_data_inventory()

# Pricing assumptions behind the chart below, named so they are easy to find
# and adjust.
# NOTE(review): 0.023 looks like the S3 Standard per-GB monthly rate and 20.0
# a flat monthly RDS estimate; neither is derived from `result` — confirm.
S3_COST_PER_GB_MONTH = 0.023
RDS_MONTHLY_COST_USD = 20.0

print("\n☁️  AWS DATA INVENTORY")
print("=" * 80)
print("\nS3:")
print(f"  Objects: {result['s3_objects']:,}")
print(f"  Size: {result['s3_size_gb']:.2f} GB")
print("\nRDS:")
print(f"  Database size: {result['rds_size_gb']:.2f} GB")
print(f"  Allocated storage: {result['rds_allocated_gb']} GB")
print(f"\n💰 Estimated monthly cost: ${result['estimated_cost_usd']:.2f}")
print("=" * 80)

# Visualize cost breakdown: S3 cost is size times the per-GB rate, RDS is the
# flat estimate. The chart does not use result['estimated_cost_usd'].
fig = go.Figure(data=[go.Pie(
    labels=['S3 Storage', 'RDS Database'],
    values=[result['s3_size_gb'] * S3_COST_PER_GB_MONTH, RDS_MONTHLY_COST_USD],
    hole=0.3
)])
fig.update_layout(title='Monthly Cost Breakdown')
fig.show()

In [None]:
# Report missing box scores / play-by-play data (Workflow #46) and chart
# how complete each data type is.
result = workflow.run_data_gap_analysis()

print("\n🔍 DATA GAP ANALYSIS")
print("=" * 80)
print(f"Total games: {result['total_games']}")
print(f"Missing box scores: {result['missing_games']}")
print(f"Missing play-by-play: {result['games_without_pbp']}")

# A gap in either data type is enough to raise the warning.
print(
    "\n⚠️  Data gaps detected!"
    if result['missing_games'] > 0 or result['games_without_pbp'] > 0
    else "\n✅ No data gaps found"
)
print("=" * 80)

# Chart only when there are games to measure against; this also keeps the
# percentage computation away from a zero denominator.
if result['total_games'] > 0:
    labels = ['Box Scores', 'Play-by-Play']
    # Games that DO have data, in the same order as `labels`.
    completeness = [
        result['total_games'] - result[gap_key]
        for gap_key in ('missing_games', 'games_without_pbp')
    ]

    fig = go.Figure(go.Bar(
        x=labels,
        y=completeness,
        text=[f"{(c/result['total_games']*100):.1f}%" for c in completeness],
        textposition='auto',
    ))
    fig.update_layout({
        'title': 'Data Completeness by Type',
        'yaxis_title': 'Games with Data',
        'template': 'plotly_white',
    })
    fig.show()

## 12. Sync Status Dashboard

Monitor synchronization between local files and S3.

In [None]:
# Compare local and S3 file counts (Workflow #49) and print a verdict.
result = workflow.run_sync_status_check()

print("\n🔄 SYNC STATUS")
print("=" * 80)
print(f"S3 files: {result['s3_files']:,}")
print(f"Local files: {result['local_files']:,}")
print(f"Drift: {result['drift_pct']:.1f}%")
print(f"Status: {result['status'].upper()}")

# One advisory line per known status; an unrecognised status prints nothing.
sync_advice = {
    'synced': "\n✅ Local and S3 are synchronized",
    'minor_drift': "\n⚠️  Drift detected - consider running sync",
    'moderate_drift': "\n⚠️  Drift detected - consider running sync",
    'major_drift': "\n🔴 MAJOR drift - URGENT sync recommended",
}
if result['status'] in sync_advice:
    print(sync_advice[result['status']])
print("=" * 80)

In [None]:
# Chart the sync check: grouped bars for the two file counts, then a gauge
# for the drift percentage. Reads `result` from the sync status cell.
fig = go.Figure(data=[
    go.Bar(
        name='S3',
        x=['File Count'],
        y=[result['s3_files']],
        marker_color='lightblue',
    ),
    go.Bar(
        name='Local',
        x=['File Count'],
        y=[result['local_files']],
        marker_color='lightgreen',
    ),
])

fig.update_layout({
    'title': 'S3 vs Local File Counts',
    'yaxis_title': 'Number of Files',
    'barmode': 'group',
    'template': 'plotly_white',
})

fig.show()

# Gauge bar colour per sync status; an unrecognised status falls back to gray.
status_colors = dict(
    synced='green',
    minor_drift='yellow',
    moderate_drift='orange',
    major_drift='red',
)

fig = go.Figure(go.Indicator(
    mode='gauge+number+delta',
    value=result['drift_pct'],
    domain=dict(x=[0, 1], y=[0, 1]),
    title=dict(text='Sync Drift %'),
    delta=dict(reference=0),
    gauge=dict(
        axis=dict(range=[None, 50]),
        bar=dict(color=status_colors.get(result['status'], 'gray')),
        # Background bands at 0-5, 5-15, 15-30 and 30-50 percent drift.
        steps=[
            dict(range=[0, 5], color='lightgreen'),
            dict(range=[5, 15], color='lightyellow'),
            dict(range=[15, 30], color='orange'),
            dict(range=[30, 50], color='red'),
        ],
        # Red marker line drawn at 30% drift.
        threshold=dict(
            line=dict(color='red', width=4),
            thickness=0.75,
            value=30,
        ),
    ),
))

fig.update_layout(height=300)
fig.show()

In [None]:
# Run all workflows and generate comprehensive report
all_results = workflow.run_all_workflows()
# Per-workflow results keyed by workflow name; looked up once and reused.
workflow_results = all_results['workflows']

print("\n📊 COMPREHENSIVE WORKFLOW REPORT")
print("=" * 80)
print(f"Timestamp: {all_results['timestamp']}")
print("\nWorkflows Run:")
for workflow_name in workflow_results:
    print(f"  ✓ {workflow_name}")
print("\n" + "=" * 80)

# Export to Excel (datetime is already imported by the setup cell)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"workflow_report_{timestamp}.xlsx"

# Convert workflow results to DataFrames for export: each workflow that ran
# becomes a single-row sheet.
export_data = {}

if 'file_inventory' in workflow_results:
    # Only the per-category counts are exported for the file inventory.
    df_files = pd.DataFrame([workflow_results['file_inventory']['categories']])
    export_data['File Inventory'] = df_files

if 'aws_inventory' in workflow_results:
    df_aws = pd.DataFrame([workflow_results['aws_inventory']])
    export_data['AWS Inventory'] = df_aws

if 'data_gaps' in workflow_results:
    df_gaps = pd.DataFrame([workflow_results['data_gaps']])
    export_data['Data Gaps'] = df_gaps

if 'sync_status' in workflow_results:
    df_sync = pd.DataFrame([workflow_results['sync_status']])
    export_data['Sync Status'] = df_sync

# Nothing to write when none of the expected workflows reported results.
if export_data:
    path = helper.export_to_excel(export_data, filename)
    print(f"\n✓ Workflow report exported: {path}")

## 11. Workflow Integration

Run existing data inventory workflows integrated with DIMS.