# Reporting and Automated Reports

This notebook logs key parameters, compiles computed metrics and visualizations, and exports them as an automated report (HTML). 

Before running this notebook, make sure that the earlier notebooks (e.g., Basic Analytics) have run so that key variables (such as `total_cases`, `accuracy`, etc.) are available.

In [20]:
import datetime
import pandas as pd
import numpy as np
from sqlalchemy import text

# Retrieve the stored connection string
%store -r accuracy
%store -r conf_stats
%store -r db_uri
%store -r kappa
%store -r labels
%store -r mc
%store -r mcc
%store -r merged_df
%store -r min_confidence
%store -r min_f1
%store -r model_version
%store -r report
%store -r total_cases

from sqlalchemy import create_engine
engine = create_engine(db_uri)
print("Engine recreated from stored db_uri.")

# Check if key metrics exist, but do not assign default values
try:
    total_cases
except NameError:
    pass

try:
    accuracy
except NameError:
    pass

try:
    model_version
except NameError:
    pass

run_parameters = {
    "run_date": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    "model_version": model_version,
    "num_test_cases": total_cases,
    "accuracy": accuracy
}

print("Run Parameters:")
for key, value in run_parameters.items():
    print(f"{key}: {value}")

Engine recreated from stored db_uri.
Run Parameters:
run_date: 2025-04-12 22:59:10
model_version: 1.0.0
num_test_cases: 99
accuracy: 1.0


In [21]:
print("Fetching results data from the database...")

# message_id / expected_label pairs from expected_results, ordered by message_id.
expected_query = text("""
SELECT message_id, expected_label 
FROM expected_results
ORDER BY message_id
""")

# message_id / predicted_label / confidence rows from actual_results, same ordering.
actual_query = text("""
SELECT message_id, predicted_label, confidence 
FROM actual_results 
ORDER BY message_id
""")

try:
    expected_df = pd.read_sql(sql=expected_query, con=engine)
    actual_df = pd.read_sql(sql=actual_query, con=engine)
except Exception as fetch_error:
    # Best-effort: the failure is reported on stdout and the cell does not raise.
    print(f"Error fetching data: {fetch_error}")

Fetching results data from the database...


In [23]:
%store -r mc
%store -r mcc
%store -r kappa
%store -r report
%store -r conf_stats
%store -r labels
%store -r min_f1
%store -r min_confidence

In [24]:
def assess_model_quality():
    """Grade the model against tiered metric thresholds and summarise the verdict.

    Takes no arguments; reads the notebook-level values ``accuracy``, ``min_f1``,
    ``mcc``, ``kappa``, ``min_confidence`` and ``merged_df``.

    Returns:
        dict with the keys "quality", "readiness", "strengths", "weaknesses",
        "recommendations" and "confidence".
    """
    # Minimum value each metric must reach to qualify for a tier.
    quality_thresholds = {
        'excellent': {'accuracy': 0.95, 'f1': 0.95, 'mcc': 0.9, 'kappa': 0.9, 'min_conf': 0.8},
        'good': {'accuracy': 0.85, 'f1': 0.85, 'mcc': 0.7, 'kappa': 0.7, 'min_conf': 0.7},
        'acceptable': {'accuracy': 0.75, 'f1': 0.75, 'mcc': 0.5, 'kappa': 0.5, 'min_conf': 0.6},
        'needs_improvement': {'accuracy': 0.65, 'f1': 0.65, 'mcc': 0.3, 'kappa': 0.3, 'min_conf': 0.5}
    }

    def clears(tier):
        """Whether all five metrics reach the minimums of ``tier``."""
        floor = quality_thresholds[tier]
        return (accuracy >= floor['accuracy'] and
                min_f1 >= floor['f1'] and
                mcc >= floor['mcc'] and
                kappa >= floor['kappa'] and
                min_confidence >= floor['min_conf'])

    # Tiers are tried strictest first; the first one fully satisfied wins.
    tier_verdicts = (
        ('excellent', "EXCELLENT"),
        ('good', "GOOD"),
        ('acceptable', "ACCEPTABLE"),
        ('needs_improvement', "NEEDS IMPROVEMENT"),
    )
    quality = "POOR"
    for tier, verdict in tier_verdicts:
        if clears(tier):
            quality = verdict
            break

    deployable = quality in ["EXCELLENT", "GOOD"]
    borderline = quality == "ACCEPTABLE"

    if deployable:
        readiness = "READY FOR PRODUCTION"
    elif borderline:
        readiness = "POTENTIALLY READY WITH MONITORING"
    else:
        readiness = "NOT READY FOR PRODUCTION"

    # Each entry pairs a check with the sentence reported when the check holds.
    strength_checks = [
        (accuracy >= 0.9, f"High overall accuracy ({accuracy:.2%})"),
        (min_f1 >= 0.9, "Strong performance across all classes"),
        (mcc >= 0.8, "Excellent correlation between predictions and ground truth"),
        (min_confidence >= 0.8, "High confidence in predictions"),
    ]
    strengths = [message for holds, message in strength_checks if holds]

    weakness_checks = [
        (accuracy < 0.8, f"Low overall accuracy ({accuracy:.2%})"),
        (min_f1 < 0.8, "Inconsistent performance across classes"),
        (mcc < 0.6, "Poor correlation between predictions and ground truth"),
        (min_confidence < 0.7, "Low confidence in some predictions"),
    ]
    weaknesses = [message for holds, message in weakness_checks if holds]

    if deployable:
        recommendations = [
            "Deploy model to production environment",
            "Implement regular monitoring to ensure continued performance",
        ]
    elif borderline:
        recommendations = [
            "Consider deployment with enhanced monitoring",
            "Investigate classes with lower F1 scores for potential improvements",
            "Collect more training data for underperforming classes",
        ]
    else:
        recommendations = [
            "Defer deployment until model quality improves",
            "Review training data for quality and representation issues",
            "Consider model architecture or hyperparameter tuning",
        ]

    # Confidence in the assessment itself depends only on how many rows were evaluated.
    sample_count = len(merged_df)
    if sample_count >= 100:
        confidence = "HIGH"
    elif sample_count >= 50:
        confidence = "MODERATE"
    else:
        confidence = "LOW (limited test data)"
        recommendations.append("Increase test dataset size for more reliable evaluation")

    return {
        "quality": quality,
        "readiness": readiness,
        "strengths": strengths,
        "weaknesses": weaknesses,
        "recommendations": recommendations,
        "confidence": confidence
    }

# Run the assessment once; the report cell reads `assessment` when building the HTML.
assessment = assess_model_quality()

# Emit the three headline fields as a single block of text.
print("\n".join([
    "Model Assessment Analysis:",
    f"- Overall Quality: {assessment['quality']}",
    f"- Production Readiness: {assessment['readiness']}",
    f"- Confidence in Results: {assessment['confidence']}",
]))

Model Assessment Analysis:
- Overall Quality: EXCELLENT
- Production Readiness: READY FOR PRODUCTION
- Confidence in Results: MODERATE


In [25]:
def _classification_rows(report_text):
    """Extract per-class rows from a plain-text classification report.

    The first two lines and the last five lines of the text are skipped
    (presumably the header/blank line and the summary block of scikit-learn's
    ``classification_report`` output — confirm against the producing notebook).
    Each remaining row is read from the right: the last four fields are
    precision, recall, F1 and support, and everything before them is the label,
    so labels that contain spaces are kept intact.

    Returns:
        list of ``(label, precision, recall, f1, support)`` tuples.
    """
    rows = []
    for line in report_text.split('\n')[2:-5]:
        parts = line.split()
        if len(parts) < 5:
            # Blank or malformed line: nothing to tabulate.
            continue
        *label_words, precision, recall, f1, support = parts
        rows.append((" ".join(label_words), float(precision), float(recall), float(f1), int(support)))
    return rows


def _balanced_accuracy(matrix, n_classes):
    """Mean per-class recall computed from a square confusion matrix.

    Rows are treated as actual classes and columns as predicted classes, which
    is how the report table labels them. Classes without any actual samples are
    left out of the mean; NaN is returned when no class has samples.
    """
    recalls = []
    for i in range(n_classes):
        row_total = sum(matrix[i][j] for j in range(n_classes))
        if row_total:
            recalls.append(matrix[i][i] / row_total)
    return sum(recalls) / len(recalls) if recalls else float("nan")


def generate_detailed_report():
    """Generate a detailed HTML report with calculated metrics and assessment.

    Takes no arguments; reads the notebook-level names ``run_parameters``,
    ``merged_df``, ``accuracy``, ``assessment``, ``report``, ``labels``, ``cm``,
    ``mcc``, ``kappa`` and ``conf_stats``.

    Text that originates outside this notebook (labels, model version, run date)
    is HTML-escaped before being embedded. The document is kept free of literal
    non-ASCII punctuation (sigma is written as an entity) so it renders the same
    regardless of the encoding the file is saved with.

    Returns:
        str: a complete HTML document.
    """
    from html import escape  # function-scope import keeps the cell self-contained

    # Collect fragments and join once at the end instead of growing one string.
    page = [
        "<html>\n",
        "<head>\n",
        "<title>Test Run Report</title>\n",
        "<style>\n",
        "body { font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }\n",
        "h1 { color: #2c3e50; border-bottom: 1px solid #eee; padding-bottom: 10px; }\n",
        "h2 { color: #3498db; margin-top: 30px; }\n",
        "h3 { color: #2980b9; }\n",
        "table { border-collapse: collapse; width: 100%; margin: 20px 0; }\n",
        "th, td { padding: 12px; text-align: left; border: 1px solid #ddd; }\n",
        "th { background-color: #f2f2f2; }\n",
        "tr:hover { background-color: #f5f5f5; }\n",
        ".metric-card { background: #f8f9fa; border-radius: 5px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }\n",
        ".metric-value { font-size: 1.2em; font-weight: bold; color: #3498db; }\n",
        ".excellent { color: #27ae60; font-weight: bold; }\n",
        ".good { color: #2ecc71; font-weight: bold; }\n",
        ".acceptable { color: #f39c12; font-weight: bold; }\n",
        ".needs-improvement { color: #e67e22; font-weight: bold; }\n",
        ".poor { color: #e74c3c; font-weight: bold; }\n",
        ".strength { color: #27ae60; }\n",
        ".weakness { color: #e74c3c; }\n",
        ".recommendation { color: #3498db; }\n",
        "</style>\n",
        "</head>\n",
        "<body>\n",
    ]

    # --- Run header -------------------------------------------------------
    page.append(f"<h1>Test Run Report - {escape(str(run_parameters['run_date']))}</h1>\n")
    page.append(f"<p><strong>Model Version:</strong> {escape(str(run_parameters['model_version']))}</p>\n")
    page.append(f"<p><strong>Number of Test Cases:</strong> {len(merged_df)}</p>\n")
    page.append(f"<p><strong>Accuracy:</strong> {accuracy:.2%}</p>\n")

    # --- Quality assessment -----------------------------------------------
    page.append("<h2>Model Quality Assessment</h2>\n")
    page.append("<div class='metric-card'>\n")

    # "NEEDS IMPROVEMENT" -> "needs-improvement", matching the CSS class names above.
    quality_class = assessment['quality'].lower().replace(' ', '-')
    page.append(f"<h3>Overall Quality: <span class='{quality_class}'>{assessment['quality']}</span></h3>\n")
    page.append(f"<p><strong>Production Readiness:</strong> {assessment['readiness']}</p>\n")
    page.append(f"<p><strong>Assessment Confidence:</strong> {assessment['confidence']}</p>\n")

    if assessment['strengths']:
        page.append("<h4>Strengths:</h4>\n")
        page.append("<ul>\n")
        for strength in assessment['strengths']:
            page.append(f"<li class='strength'>{strength}</li>\n")
        page.append("</ul>\n")

    if assessment['weaknesses']:
        page.append("<h4>Areas for Improvement:</h4>\n")
        page.append("<ul>\n")
        for weakness in assessment['weaknesses']:
            page.append(f"<li class='weakness'>{weakness}</li>\n")
        page.append("</ul>\n")

    page.append("<h4>Recommendations:</h4>\n")
    page.append("<ul>\n")
    for recommendation in assessment['recommendations']:
        page.append(f"<li class='recommendation'>{recommendation}</li>\n")
    page.append("</ul>\n")

    page.append("</div>\n")

    page.append("<h2>Additional Metrics and Visualizations</h2>\n")

    # --- Classification report table --------------------------------------
    page.append("<div class='metric-card'>\n")
    page.append("<h3>Classification Report</h3>\n")
    page.append("<table>\n")
    page.append("<tr><th>Label</th><th>Precision</th><th>Recall</th><th>F1-Score</th><th>Support</th></tr>\n")

    for label, precision, recall, f1, support in _classification_rows(report):
        page.append(
            f"<tr><td>{escape(label)}</td><td>{precision:.2f}</td><td>{recall:.2f}</td>"
            f"<td>{f1:.2f}</td><td>{support}</td></tr>\n"
        )

    page.append("</table>\n")
    page.append("</div>\n")

    # --- Confusion matrix (rows = actual, columns = predicted) ------------
    n_classes = len(labels)
    page.append("<div class='metric-card'>\n")
    page.append("<h3>Confusion Matrix</h3>\n")
    page.append("<table>\n")

    page.append("<tr><th></th>")
    for label in labels:
        page.append(f"<th>Predicted {escape(str(label))}</th>")
    page.append("</tr>\n")

    for i, true_label in enumerate(labels):
        page.append(f"<tr><td>Actual {escape(str(true_label))}</td>")
        for j in range(n_classes):
            page.append(f"<td>{cm[i][j]}</td>")
        page.append("</tr>\n")

    page.append("</table>\n")
    page.append("</div>\n")

    # --- Advanced metrics -------------------------------------------------
    # Balanced accuracy is derived from the confusion matrix; plain accuracy is
    # already shown in the header and is a different quantity on imbalanced data.
    balanced_accuracy = _balanced_accuracy(cm, n_classes)

    page.append("<div class='metric-card'>\n")
    page.append("<h3>Advanced Evaluation Metrics</h3>\n")
    page.append("<ul>\n")
    page.append(f"<li><strong>Matthews Correlation Coefficient:</strong> <span class='metric-value'>{mcc:.2f}</span></li>\n")
    page.append(f"<li><strong>Cohen's Kappa:</strong> <span class='metric-value'>{kappa:.2f}</span></li>\n")
    page.append(f"<li><strong>Balanced Accuracy:</strong> <span class='metric-value'>{balanced_accuracy:.2f}</span></li>\n")
    page.append("</ul>\n")
    page.append("</div>\n")

    # --- Confidence distribution ------------------------------------------
    page.append("<div class='metric-card'>\n")
    page.append("<h3>Confidence Score Distribution</h3>\n")
    page.append("<ul>\n")
    for label, stats in conf_stats.items():
        page.append(
            f"<li><strong>{escape(str(label))}:</strong> Mean confidence = "
            f"<span class='metric-value'>{stats['mean']:.2f}</span> (&sigma; = {stats['std']:.2f})</li>\n"
        )
        page.append(f"<ul><li>Range: {stats['min']:.2f} - {stats['max']:.2f}</li></ul>\n")
    page.append("</ul>\n")

    # Share of rows whose confidence is strictly above 0.9, as a percentage.
    high_conf_pct = (merged_df['confidence'] > 0.9).mean() * 100
    page.append(f"<p>{high_conf_pct:.1f}% of predictions have confidence scores above 0.90</p>\n")
    page.append("</div>\n")

    page.append("</body>\n")
    page.append("</html>")

    return "".join(page)

# Build the report once and keep the HTML string in `report_html`; the next
# cell writes that string to disk.
report_html = generate_detailed_report()
print("Generated HTML report with detailed metrics and senior assessment.")

Generated HTML report with detailed metrics and senior assessment.


In [26]:
# Save the report as a timestamped HTML file in the current working directory.
# The encoding is set explicitly: when it is omitted, open() uses the platform's
# locale encoding, which may be unable to encode non-ASCII characters in the
# report and would then raise UnicodeEncodeError during the write.
report_filename = f"test_run_report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
with open(report_filename, 'w', encoding='utf-8') as f:
    f.write(report_html)

print(f"Automated report saved as {report_filename}")

Automated report saved as test_run_report_20250412_225918.html


## Conclusion

The automated reporting system serves as the final component in the SWIFT message testing framework. It generates records of model performance, enabling comparison across versions to track improvements and detect regressions.