# SRE Performance Analysis Notebook

This notebook provides interactive analysis of SRE metrics and performance data collected from AppDynamics and other monitoring sources.

## Contents
1. Data Loading and Exploration
2. SLO/SLA Compliance Analysis
3. Trend Analysis
4. Incident Correlation Analysis
5. Predictive Analytics
6. Generate New Report

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime, timedelta
import json

# Import our SRE modules
from collectors.oauth_appdynamics_collector import OAuthAppDynamicsCollector
from reports.enhanced_sre_report_system import EnhancedSREReportSystem

# Configure plotting
# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure created below.
# NOTE(review): presumably this style name only exists in newer matplotlib
# releases (the seaborn styles were renamed) — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default color cycle.
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Data Loading and Exploration

In [None]:
# Load the latest SRE report data
def load_latest_report_data(reports_dir='../reports/generated'):
    """Load the most recent SRE report data.

    Looks in ``reports_dir`` for files whose name contains ``sre_data`` and
    ends with ``.json``, then parses the one whose filename sorts last.

    Args:
        reports_dir: Directory to search. Defaults to the notebook-relative
            ``../reports/generated`` folder.

    Returns:
        The parsed JSON content of the selected file, or ``None`` (after
        printing a hint) when the directory is missing or holds no matching
        file.
    """
    # A missing output folder simply means no report has been generated yet;
    # report it the same way as an empty folder instead of raising.
    if not os.path.isdir(reports_dir):
        print("No report data found. Generate a report first.")
        return None

    # Find the candidate JSON reports
    json_files = [f for f in os.listdir(reports_dir) if f.endswith('.json') and 'sre_data' in f]
    if not json_files:
        print("No report data found. Generate a report first.")
        return None

    # NOTE(review): "latest" means the lexicographically greatest filename,
    # which presumes the names embed a sortable timestamp — confirm against
    # the code that writes these reports.
    latest_file = max(json_files)
    file_path = os.path.join(reports_dir, latest_file)

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Loaded data from: {latest_file}")
    return data

# Read the newest report and echo its header fields
sre_data = load_latest_report_data()
if sre_data:
    report_meta = sre_data['report_metadata']
    print(f"Report generated at: {report_meta['generated_at']}")
    print(f"Application: {report_meta['application_name']}")

In [None]:
# Convert SLO metrics to DataFrame for analysis.
# Fetching with .get() also skips an empty 'slo_metrics' collection, which
# would otherwise produce a frame without a 'timestamp' column and fail below.
slo_records = sre_data.get('slo_metrics') if sre_data else None
if slo_records:
    metrics_df = pd.DataFrame(slo_records)

    # Convert timestamp to datetime
    metrics_df['timestamp'] = pd.to_datetime(metrics_df['timestamp'])

    print(f"Loaded {len(metrics_df)} metrics")
    print("\nMetrics Summary:")
    print(metrics_df.groupby(['service_name', 'metric_name'])['status'].value_counts())

    # Display first few rows. Jupyter only auto-renders a bare expression when
    # it is the last top-level statement of a cell; inside an `if` body it is
    # silently dropped, so call display() explicitly.
    display(metrics_df.head())

## 2. SLO/SLA Compliance Analysis

In [None]:
# SLO Compliance Dashboard: four matplotlib panels built from metrics_df
if 'metrics_df' not in locals():
    print("No metrics data available for analysis")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax_compliance, ax_budget), (ax_avail, ax_status) = axes

    # Panel 1 - percentage of rows per service whose status is 'compliant'
    compliance_by_service = metrics_df.groupby('service_name')['status'].apply(
        lambda statuses: (statuses == 'compliant').sum() / len(statuses) * 100
    )
    ax_compliance.bar(compliance_by_service.index, compliance_by_service.values)
    ax_compliance.set(title='SLO Compliance by Service (%)', ylabel='Compliance %')
    ax_compliance.tick_params(axis='x', rotation=45)

    # Panel 2 - mean error budget consumed per service, traffic-light colored
    error_budget = metrics_df.groupby('service_name')['error_budget_consumed'].mean()
    colors = []
    for consumed in error_budget.values:
        if consumed > 75:
            colors.append('red')
        elif consumed > 50:
            colors.append('orange')
        else:
            colors.append('green')
    ax_budget.bar(error_budget.index, error_budget.values, color=colors)
    ax_budget.set(title='Error Budget Consumption (%)', ylabel='Budget Consumed %')
    ax_budget.tick_params(axis='x', rotation=45)

    # Panel 3 - availability rows only: current value next to its SLO target
    availability_metrics = metrics_df[metrics_df['metric_name'] == 'availability']
    x_pos = np.arange(len(availability_metrics))
    paired_bars = (
        (-0.2, 'current_value', 'Current'),
        (0.2, 'slo_target', 'SLO Target'),
    )
    for offset, column, legend_label in paired_bars:
        ax_avail.bar(x_pos + offset, availability_metrics[column], 0.4,
                     label=legend_label, alpha=0.7)
    ax_avail.set(xlabel='Services', ylabel='Availability %',
                 title='Availability: Current vs SLO Target')
    ax_avail.set_xticks(x_pos)
    ax_avail.set_xticklabels(availability_metrics['service_name'], rotation=45)
    ax_avail.legend()

    # Panel 4 - share of each status value; unknown statuses are drawn gray
    status_counts = metrics_df['status'].value_counts()
    colors_pie = {'compliant': 'green', 'at_risk': 'orange', 'breached': 'red'}
    pie_colors = list(map(lambda status: colors_pie.get(status, 'gray'), status_counts.index))
    ax_status.pie(status_counts.values, labels=status_counts.index,
                  colors=pie_colors, autopct='%1.1f%%')
    ax_status.set_title('Overall SLO Status Distribution')

    plt.tight_layout()
    plt.show()

## 3. Trend Analysis

In [None]:
# Generate trend analysis using historical data
def analyze_trends(seed=42, start='2024-08-01', periods=30):
    """Build a synthetic daily time series for every service/metric pair.

    This is sample data only (in production, this would come from a time
    series DB). Each series is a fixed centre value plus Gaussian noise,
    clipped to a plausible range:

    * ``availability``: around 99.9, clipped to 99.5 - 99.99
    * ``latency_p95``: around 200, clipped to 100 - 300
    * ``error_rate``: 0.1 plus the absolute value of the noise, clipped to 0 - 1

    Args:
        seed: Seed for the random generator. The fixed default makes a fresh
            "Restart & Run All" reproduce the same charts and risk scores;
            pass ``None`` for a different sample on every call.
        start: First date of the series (anything ``pd.date_range`` accepts).
        periods: Number of consecutive days to generate.

    Returns:
        A long-format DataFrame with columns ``date``, ``service``,
        ``metric`` and ``value``, ordered by service, then metric, then date.
    """
    services = ['web-service', 'api-service', 'auth-service', 'database-service']

    # metric -> (centre, noise sigma, lower clip, upper clip, noise is one-sided)
    profiles = {
        'availability': (99.9, 0.05, 99.5, 99.99, False),
        'latency_p95': (200, 20, 100, 300, False),
        'error_rate': (0.1, 0.05, 0, 1, True),
    }

    # A local generator keeps the draw independent of numpy's global state.
    rng = np.random.default_rng(seed)
    dates = pd.date_range(start=start, periods=periods, freq='D')

    series_frames = []
    for service in services:
        for metric, (centre, sigma, low, high, one_sided) in profiles.items():
            noise = rng.normal(0, sigma, len(dates))
            if one_sided:
                # Error rate can only drift upwards from its baseline.
                noise = np.abs(noise)
            series_frames.append(pd.DataFrame({
                'date': dates,
                'service': service,
                'metric': metric,
                'value': np.clip(centre + noise, low, high),
            }))

    # One frame per series, concatenated once, instead of one dict per row.
    return pd.concat(series_frames, ignore_index=True)

# Build the sample series, then draw one interactive Plotly line chart per metric
trend_df = analyze_trends()

for metric in ('availability', 'latency_p95', 'error_rate'):
    # Human-readable metric name, shared by the chart title and the y-axis label
    pretty_name = metric.replace('_', ' ').title()
    metric_data = trend_df.loc[trend_df['metric'] == metric]

    fig = px.line(
        metric_data,
        x='date',
        y='value',
        color='service',
        title=f'{pretty_name} Trend (30 Days)',
        labels={'value': pretty_name},
    )
    fig.update_layout(height=400)
    fig.show()

print("📈 Trend analysis complete")

## 4. Incident Correlation Analysis

In [None]:
# Analyze incident data if available
if sre_data and 'incident' in sre_data and sre_data['incident']:
    incident = sre_data['incident']
    affected = incident['affected_services']
    # An incident that is still open may carry a null end_time or omit the key.
    end_raw = incident.get('end_time')

    print("🚨 Incident Analysis")
    print(f"Incident ID: {incident['incident_id']}")
    print(f"Severity: {incident['severity']}")
    print(f"Duration: {incident['start_time']} to {end_raw}")
    print(f"Affected Services: {', '.join(affected)}")
    print(f"\nRoot Cause: {incident['root_cause']}")

    # Create incident timeline visualization
    fig = go.Figure()

    start_time = pd.to_datetime(incident['start_time'])
    if end_raw:
        end_time = pd.to_datetime(end_raw)
    else:
        # Fall back to "now" in the same timezone as start_time: subtracting a
        # naive timestamp from a timezone-aware one raises TypeError.
        end_time = pd.Timestamp.now(tz=start_time.tz)

    # Horizontal centre of every bar; it does not change per service.
    midpoint = start_time + (end_time - start_time) / 2

    # One red bar per affected service, labelled with the service name
    for i, service in enumerate(affected):
        fig.add_shape(
            type="rect",
            x0=start_time, x1=end_time,
            y0=i, y1=i+0.8,
            fillcolor="red", opacity=0.6,
            line=dict(color="red", width=2)
        )

        fig.add_annotation(
            x=midpoint,
            y=i + 0.4,
            text=service,
            showarrow=False,
            font=dict(color="white", size=10)
        )

    fig.update_layout(
        title="Incident Timeline - Affected Services",
        xaxis_title="Time",
        yaxis_title="Services",
        yaxis=dict(tickmode='array', tickvals=list(range(len(affected))),
                   ticktext=affected),
        height=300
    )

    fig.show()

    # Display LLM analysis, truncated to the first 500 characters
    llm_text = incident.get('llm_analysis')
    if llm_text:
        print("\n🤖 AI-Powered Analysis:")
        print(llm_text[:500] + "..." if len(llm_text) > 500 else llm_text)
else:
    print("No incident data available in the current report")

## 5. Predictive Analytics

In [None]:
# Simple predictive analysis using trend data
def predict_slo_breach_risk(trend_df):
    """Score each service's SLO breach risk from its metric trends.

    For every service in ``trend_df`` and each of the three tracked metrics,
    a series with more than five points gets a score made of a straight-line
    slope term plus a standard-deviation term, capped at 100. A falling slope
    counts against availability; a rising slope counts against latency and
    error rate.

    Args:
        trend_df: Long-format frame with ``service``, ``metric`` and
            ``value`` columns; rows are used in the order they appear.

    Returns:
        Dict keyed by service name. Each value holds ``overall_risk`` (the
        highest metric score, or 0 when none was computed), ``metric_risks``
        (score per metric) and ``risk_level`` (``'High'`` above 70,
        ``'Medium'`` above 40, otherwise ``'Low'``).
    """
    tracked_metrics = ('availability', 'latency_p95', 'error_rate')
    report = {}

    for svc_name in trend_df['service'].unique():
        svc_rows = trend_df[trend_df['service'] == svc_name]
        per_metric = {}

        for metric_name in tracked_metrics:
            series = svc_rows.loc[svc_rows['metric'] == metric_name, 'value'].values

            # Too few points for a meaningful line fit
            if len(series) <= 5:
                continue

            # Slope of the least-squares line through the series
            gradient = np.polyfit(np.arange(len(series)), series, 1)[0]
            # Volatility as the standard deviation of the series
            spread = np.std(series)

            if metric_name == 'availability':
                # Availability getting lower is the bad direction
                score = max(0, -gradient * 100) + spread * 10
            else:
                # Latency / error rate getting higher is the bad direction
                score = max(0, gradient * 10) + spread * 5

            per_metric[metric_name] = min(100, score)  # never report above 100

        # The worst metric drives the service-level score
        worst = max(per_metric.values()) if per_metric else 0
        if worst > 70:
            level = 'High'
        elif worst > 40:
            level = 'Medium'
        else:
            level = 'Low'

        report[svc_name] = {
            'overall_risk': worst,
            'metric_risks': per_metric,
            'risk_level': level,
        }

    return report

# Score every service found in the trend data
predictions = predict_slo_breach_risk(trend_df)

# Text summary: one stanza per service, overall score first, then each metric
print("🔮 SLO Breach Risk Predictions (Next 7 Days)")
print("=" * 50)

for service in predictions:
    pred = predictions[service]
    print(f"\n{service}:")
    print(f"  Overall Risk: {pred['overall_risk']:.1f}% ({pred['risk_level']})")
    for metric in pred['metric_risks']:
        risk = pred['metric_risks'][metric]
        print(f"  {metric}: {risk:.1f}%")

# Bar chart of the overall score per service, colored red / orange / green
# using the same 70 and 40 cut-offs as the risk levels
services = list(predictions)
risks = [entry['overall_risk'] for entry in predictions.values()]
colors = [('red' if r > 70 else ('orange' if r > 40 else 'green')) for r in risks]

fig = go.Figure()
fig.add_trace(go.Bar(x=services, y=risks, marker_color=colors))
fig.update_layout(
    title="SLO Breach Risk Prediction by Service",
    xaxis_title="Service",
    yaxis_title="Risk Score (%)",
    height=400,
)

fig.show()

## 6. Generate New Report

Generate a fresh SRE report with the latest data. The call at the end of the next cell is commented out; uncomment it to run the generation.

In [None]:
# Generate a new comprehensive report
def generate_fresh_report():
    """Run the report system once for a demo application.

    Builds an ``EnhancedSREReportSystem``, asks it for the full report suite
    covering five demo services with a one-hour incident placed two hours
    before the current time, and prints the path of every report returned.

    Returns:
        ``True`` when generation finished, ``False`` when any exception was
        raised (the error message is printed rather than propagated).
    """
    print("🔄 Generating fresh SRE report...")

    try:
        # Report system for the demo application
        system = EnhancedSREReportSystem(app_name='Interactive Analysis Demo')

        # Incident scenario starts two hours before now
        two_hours = timedelta(hours=2)
        incident_time = datetime.now() - two_hours

        # Full report suite for the demo services
        report_paths = system.generate_full_report_suite(
            application_name='Interactive Analysis Demo',
            services=[
                'web-frontend',
                'api-gateway',
                'user-service',
                'order-service',
                'payment-service',
            ],
            incident_time=incident_time,
            incident_duration=1.0,
        )

        print("✅ Fresh report generated!")
        for report_type, path in report_paths.items():
            # Skip report types that produced no file
            if not path:
                continue
            print(f"  📊 {report_type.replace('_', ' ').title()}: {path}")

        return True

    except Exception as e:
        print(f"❌ Report generation failed: {e}")
        return False

# Uncomment to generate a fresh report
# generate_fresh_report()

## Conclusion

This notebook provides a comprehensive analysis framework for SRE performance data. Key capabilities include:

- **SLO monitoring** with compliance tracking, based on the most recently generated report
- **Trend analysis** for performance prediction (currently run on generated sample data)
- **Incident correlation** with root cause analysis
- **Predictive analytics** for proactive SRE management
- **Interactive visualizations** for better insights

### Next Steps:
1. Connect to live AppDynamics data for real-time analysis
2. Implement more sophisticated ML models for prediction
3. Add automated alerting based on risk predictions
4. Integrate with incident management systems
5. Create custom dashboards for different stakeholders