In [None]:
pip install pandas numpy plotly scikit-learn seaborn statsmodels

## Initial Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from datetime import datetime

# Read our existing dataset
df = pd.read_csv('../01-firefighting-to-forecasting/api_performance_data.csv')

# Calculate key statistics per endpoint (one row per endpoint)
stats_df = df.groupby('endpoint').agg({
    'requests': ['mean', 'std', 'min', 'max'],
    'latency_ms': ['mean', 'std'],
    'error_rate': 'mean',
    'cpu_utilization': 'mean',
    'memory_utilization': 'mean'
}).round(2)

# Flatten the (metric, aggregation) column pairs into names such as
# 'requests_mean', then turn the endpoint index back into a regular column
stats_df.columns = ['_'.join(col).strip() for col in stats_df.columns.values]
stats_df = stats_df.reset_index()

# Format the columns for better readability (values become display strings)
for col in stats_df.columns:
    if 'requests' in col:
        stats_df[col] = stats_df[col].apply(lambda x: f"{x:,.0f}")
    elif 'latency' in col:
        stats_df[col] = stats_df[col].apply(lambda x: f"{x:.1f}")
    elif 'error' in col or 'utilization' in col:
        stats_df[col] = stats_df[col].apply(lambda x: f"{x:.2f}")

# Row labels, listed in the same order as the aggregated columns of stats_df
metric_labels = ['Avg Requests', 'Std Requests', 'Min Requests', 'Max Requests',
                 'Avg Latency', 'Std Latency', 'Error Rate %', 'Avg CPU %', 'Avg Mem %']
metric_cols = [col for col in stats_df.columns if col != 'endpoint']
assert len(metric_labels) == len(metric_cols), "row labels out of sync with aggregated metrics"

# go.Table takes its cells column by column. The header declares one column per
# endpoint, so every cell column must hold ONE endpoint's value for each metric
# (i.e. one row of stats_df), not one metric's values across all endpoints.
endpoint_columns = stats_df[metric_cols].values.tolist()
n_table_columns = len(stats_df) + 1  # metric-label column + one per endpoint

# Create the table with fixed column widths and full data display
fig = go.Figure(data=[go.Table(
    header=dict(
        values=['<b>Metric</b>'] + [f"<b>{endpoint}</b>" for endpoint in stats_df['endpoint']],
        font=dict(size=12, color='rgb(55, 65, 81)'),
        fill_color='rgb(244, 246, 248)',
        align=['left'] * n_table_columns,
        height=36
    ),
    cells=dict(
        values=[metric_labels] + endpoint_columns,
        font=dict(size=11, color='rgb(75, 85, 99)'),
        fill_color='rgb(255, 255, 255)',
        align=['left'] * n_table_columns,
        height=30
    ),
    columnwidth=[150] + [100] * len(stats_df)  # Fixed widths for all columns
)])

# Update layout for a cleaner look
fig.update_layout(
    width=1000,  # Width to accommodate all columns
    height=400,  # Height to ensure all rows are visible
    margin=dict(l=0, r=0, t=0, b=0),
    paper_bgcolor='rgba(0,0,0,0)'
)

# Save as PNG with high resolution
fig.write_image("api_metrics_summary.png", scale=2)

# Display in notebook
fig.show()

# Also capture the overall time range for our article
time_range = {
    'start': pd.to_datetime(df['timestamp'].min()).strftime('%Y-%m-%d'),
    'end': pd.to_datetime(df['timestamp'].max()).strftime('%Y-%m-%d'),
    'total_records': len(df)
}

print("\nDataset Coverage:")
print(f"From {time_range['start']} to {time_range['end']}")
print(f"Total records: {time_range['total_records']:,}")

## Pattern Visualization

In [19]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def create_enhanced_visualization(df):
    """Build a three-panel overview figure of the API performance data.

    Layout (2 x 2 grid, top panel spanning both columns):
      * top: latency against request volume, one trace per endpoint;
      * bottom-left: correlation heatmap of the five numeric metrics;
      * bottom-right: mean CPU and memory utilization per endpoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``endpoint``, ``requests``, ``latency_ms``,
        ``error_rate``, ``cpu_utilization`` and ``memory_utilization``.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled figure. Nothing is displayed or written to disk here.
    """
    # Top panel spans both columns; the bottom row holds the heatmap and the bars
    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"colspan": 2}, None],
               [{"type": "heatmap"}, {"type": "bar"}]],
        subplot_titles=("API Endpoint Performance Overview", 
                       "Metric Correlations", "Resource Usage by Endpoint"),
        vertical_spacing=0.12,
        horizontal_spacing=0.1
    )
    
    # Add endpoint performance line plot
    for endpoint in df['endpoint'].unique():
        # Plotly joins points in array order, so sort by the x value; otherwise
        # the connecting lines zig-zag across the panel in record order.
        endpoint_data = df[df['endpoint'] == endpoint].sort_values('requests')
        fig.add_trace(
            go.Scatter(
                name=endpoint,
                x=endpoint_data['requests'],
                y=endpoint_data['latency_ms'],
                mode='markers+lines',
                marker=dict(size=6),
                opacity=0.7
            ),
            row=1, col=1
        )

    # Add correlation heatmap
    metric_cols = ['requests', 'latency_ms', 'error_rate', 
                  'cpu_utilization', 'memory_utilization']
    corr = df[metric_cols].corr()
    
    fig.add_trace(
        go.Heatmap(
            z=corr.values,
            x=corr.columns,
            y=corr.columns,
            colorscale="RdBu",
            zmin=-1, zmax=1,
            # By default the colorbar spans the full figure height at the right
            # edge, where it collides with the legend. Confine it to the bottom
            # row: with two rows and vertical_spacing=0.12 that row covers
            # y in [0, 0.44] of the plotting area.
            colorbar=dict(len=0.44, y=0.22)
        ),
        row=2, col=1
    )

    # Add resource usage bar chart (mean utilization per endpoint)
    resource_stats = df.groupby('endpoint')[['cpu_utilization', 'memory_utilization']].mean()
    
    fig.add_trace(
        go.Bar(
            name="CPU",
            x=resource_stats.index,
            y=resource_stats['cpu_utilization'],
            marker_color='rgb(55, 83, 109)'
        ),
        row=2, col=2
    )

    fig.add_trace(
        go.Bar(
            name="Memory",
            x=resource_stats.index,
            y=resource_stats['memory_utilization'],
            marker_color='rgb(26, 118, 255)'
        ),
        row=2, col=2
    )

    # Update layout
    fig.update_layout(
        height=1000,
        width=1200,
        showlegend=True,
        template='plotly_white',
        title_text="API Performance Multi-Dimensional Analysis",
        title_x=0.5,
        title_font_size=20
    )
    
    # Update axes labels
    fig.update_xaxes(title_text="Requests", row=1, col=1)
    fig.update_yaxes(title_text="Latency (ms)", row=1, col=1)
    fig.update_xaxes(title_text="Metrics", row=2, col=1)
    fig.update_yaxes(title_text="Metrics", row=2, col=1)
    fig.update_xaxes(title_text="Endpoint", row=2, col=2)
    fig.update_yaxes(title_text="Utilization %", row=2, col=2)

    return fig

# Generate and save visualization
# Builds the multi-panel figure from the full dataset and exports it as a PNG;
# scale=2 is the same export setting used for the other figures in this notebook.
# Note: the figure is only written to disk here, it is not displayed in the cell.
vis_fig = create_enhanced_visualization(df)
vis_fig.write_image("api_performance_analysis.png", scale=2)

## Building the Model

In [None]:
# Link to GitHub cell: 
# https://github.com/stackgazer/stats-for-engineers/blob/main/1_regression/02_multi_factor.ipynb#Model-Visualization

import plotly.graph_objects as go
from plotly.subplots import make_subplots

def create_model_analysis_plot(y_test, y_pred, feature_names, coefficients):
    """Build a 2 x 2 diagnostic figure for a fitted regression model.

    Panels:
      * top-left: predicted vs. actual values with a dashed y = x reference;
      * top-right: residuals (actual - predicted) against predictions;
      * bottom-left: absolute coefficient per feature;
      * bottom-right: histogram of the residuals.

    Parameters
    ----------
    y_test : array-like
        Observed target values (list, NumPy array, pandas Series, or a
        single-column frame).
    y_pred : array-like
        Model predictions, in the same order and of the same length as
        ``y_test``.
    feature_names : sequence of str
        Bar labels, in the same order as ``coefficients``.
    coefficients : array-like
        Fitted coefficients; a ``(1, n_features)`` array is flattened.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled figure. Nothing is displayed or written to disk here.

    Raises
    ------
    ValueError
        If ``y_test`` and ``y_pred`` do not contain the same number of values.

    Notes
    -----
    Absolute coefficients are only comparable across features when the
    features were on a common scale when the model was fit.
    """
    # Work on plain 1-D arrays. Subtracting two pandas objects aligns them on
    # index labels rather than position, which silently yields wrong residuals
    # (or NaNs) when, for example, y_test keeps a shuffled index.
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_test and y_pred must have the same length "
            f"(got {actual.size} and {predicted.size})"
        )
    coef_magnitude = np.abs(np.asarray(coefficients, dtype=float)).ravel()

    # Four independent panels on a 2 x 2 grid
    fig = make_subplots(rows=2, cols=2, 
                       subplot_titles=('Predicted vs Actual Latency',
                                     'Residual Plot',
                                     'Feature Importance',
                                     'Prediction Error Distribution'))
    
    # Predicted vs Actual
    fig.add_trace(
        go.Scatter(x=actual, y=predicted, mode='markers',
                  name='Predictions', marker=dict(color='#2E86C1')),
        row=1, col=1
    )
    if actual.size:
        # y = x reference line; skipped for empty input, where min/max are undefined
        lo, hi = actual.min(), actual.max()
        fig.add_trace(
            go.Scatter(x=[lo, hi],
                      y=[lo, hi],
                      mode='lines', name='Perfect Prediction',
                      line=dict(dash='dash', color='gray')),
            row=1, col=1
        )
    
    # Residual Plot
    residuals = actual - predicted
    fig.add_trace(
        go.Scatter(x=predicted, y=residuals, mode='markers',
                  name='Residuals', marker=dict(color='#E74C3C')),
        row=1, col=2
    )
    
    # Feature Importance
    fig.add_trace(
        go.Bar(x=feature_names, y=coef_magnitude,
               name='|Coefficient|', marker_color='#27AE60'),
        row=2, col=1
    )
    
    # Error Distribution
    fig.add_trace(
        go.Histogram(x=residuals, name='Error Distribution',
                    marker_color='#8E44AD'),
        row=2, col=2
    )
    
    # Update layout
    fig.update_layout(height=800, width=1000, showlegend=False,
                     title_text="Multi-Factor Model Analysis")

    # Label every axis so each panel can be read on its own
    fig.update_xaxes(title_text="Actual Latency", row=1, col=1)
    fig.update_yaxes(title_text="Predicted Latency", row=1, col=1)
    fig.update_xaxes(title_text="Predicted Latency", row=1, col=2)
    fig.update_yaxes(title_text="Residual (Actual - Predicted)", row=1, col=2)
    fig.update_xaxes(title_text="Feature", row=2, col=1)
    fig.update_yaxes(title_text="|Coefficient|", row=2, col=1)
    fig.update_xaxes(title_text="Residual", row=2, col=2)
    fig.update_yaxes(title_text="Count", row=2, col=2)
    
    return fig

# Create and save visualization
# NOTE(review): y_test, y_pred and multi_model are not defined in any cell shown
# in this notebook — presumably they come from the model-fitting cell referenced
# by the GitHub link above; confirm they exist before running on a fresh kernel.
# Bar-chart labels; presumably in the same order as the columns the model was
# fit on (and hence as multi_model.coef_) — verify against the fitting code.
feature_names = ['Requests', 'Error Rate', 'CPU Util', 'Memory Util']
vis_fig = create_model_analysis_plot(y_test, y_pred, feature_names, multi_model.coef_)
vis_fig.write_image("multi_factor_analysis.png", scale=2)