# In [None]:
# notebooks/week12_evaluation.ipynb

"""
# Week 12: Evaluating & Hardening LLM Apps
## Production-Ready Healthcare Assistant

### Learning Objectives
1. Implement comprehensive RAG evaluation
2. Detect and prevent hallucinations
3. Run safety red-team tests
4. Set up regression testing
5. Add production monitoring
6. Complete the capstone project

### Setup
"""

# Cell 1: Setup
import os
import sys
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv

# Treat the parent of the current working directory as the repository root
# so that `src.*` imports resolve when this is run from notebooks/.
project_root = Path.cwd().parent
# Re-running this cell must not stack duplicate entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Load environment variables (e.g. OPENAI_API_KEY) from a .env file, if any.
load_dotenv()

# Report only whether the key is present; never echo its value.
print("OpenAI API Key:", "✅" if os.getenv("OPENAI_API_KEY") else "❌")


# Cell 2: Load RAG System
"""
## Part 1: Setup RAG for Evaluation
"""

from src.llm.rag import VectorStoreManager, get_vector_store
from src.llm.rag.chains import RAGChain

# Obtain the project's vector store handle.
vector_store = get_vector_store()

# Prefer the persisted "healthcare_policies" store. If loading fails, rebuild
# it from the policy documents and save it under the same name.
try:
    vector_store.load("healthcare_policies")
except Exception as exc:
    # `except Exception` rather than a bare `except:` so that Ctrl-C and
    # kernel interrupts propagate instead of silently triggering a rebuild.
    print("⚠️ Vector store not found - creating new one")
    # Surface the actual failure so a corrupt store is not mistaken for a
    # missing one.
    print(f"   Reason: {type(exc).__name__}: {exc}")
    from src.llm.rag import load_policy_documents
    # NOTE(review): this path is relative to the current working directory,
    # not to project_root - confirm it resolves when run from notebooks/.
    docs = load_policy_documents("data/documents")
    vector_store.create_from_documents(docs)
    vector_store.save("healthcare_policies")
else:
    print("✅ Vector store loaded")

# Create RAG chain on top of the (loaded or rebuilt) vector store.
rag_chain = RAGChain(vector_store)
print("✅ RAG chain ready")


# Cell 3: RAG Quality Evaluation with Ragas
"""
## Part 2: RAG Quality Metrics
"""

from src.llm.evaluation import RagasEvaluator
from src.llm.rag.evaluation import create_healthcare_golden_set

# Build the golden set that serves as the evaluation reference.
golden = create_healthcare_golden_set()
print(f"Golden set has {len(golden.questions)} questions")

# Evaluator configured with one threshold per metric name.
evaluator = RagasEvaluator(
    thresholds=dict(
        faithfulness=0.7,
        answer_relevancy=0.7,
        context_precision=0.6,
    )
)

# The golden set yields two parallel collections: questions and ground truths.
questions, ground_truths = golden.to_eval_format()

# Only the first five pairs are fed through the chain to keep the demo short.
print("\nEvaluating RAG quality...")
evaluator.add_samples_from_chain(rag_chain, questions[:5], ground_truths[:5])
results = evaluator.evaluate()

# Banner: blank line, rule, title, rule.
print("\n" + "=" * 50, "RAG QUALITY RESULTS", "=" * 50, sep="\n")
print(f"Evaluator: {results['evaluator']}")
print(f"Pass Rate: {results['pass_rate']:.1%}")
print("\nMetric Summary:")
# 'summary' is optional in the results; each entry carries mean/min/max.
for metric_name, spread in results.get('summary', {}).items():
    mean, low, high = spread['mean'], spread['min'], spread['max']
    print(f"  {metric_name}: {mean:.3f} (min: {low:.3f}, max: {high:.3f})")


# Cell 4: Hallucination Detection
"""
## Part 3: Hallucination Detection
"""

from src.llm.evaluation import HallucinationDetector

detector = HallucinationDetector()

# Ask a single question and request the sources alongside the answer so the
# detector can be given the retrieved text as context.
test_question = "What are the consequences of missing appointments?"
result = rag_chain.ask(test_question, return_sources=True)

answer_text = result["answer"]
source_texts = [source["content"] for source in result.get("sources", [])]
detection = detector.detect(
    question=test_question,
    answer=answer_text,
    contexts=source_texts,
)

# Banner: blank line, rule, title, rule.
print("\n" + "=" * 50, "HALLUCINATION DETECTION", "=" * 50, sep="\n")
print(f"Question: {test_question}")
# The answer is truncated to 300 characters for display.
print(f"\nAnswer: {answer_text[:300]}...")
print(f"\nHas Hallucination: {detection.has_hallucination}")
print(f"Confidence: {detection.confidence:.2f}")
print(f"Claims Checked: {detection.claims_checked}")
print(f"Claims Verified: {detection.claims_verified}")

# List each reported issue with a 50-character preview of its claim.
if detection.issues:
    print("\nIssues Found:")
    for issue in detection.issues:
        claim_preview = issue['claim'][:50]
        print(f"  - {claim_preview}...")
        print(f"    Status: {issue['status']}")


# Cell 5: Batch Hallucination Check
"""
### Batch Hallucination Check
"""

# Questions run through the chain and then through the hallucination detector.
test_questions = [
    "What is the cancellation policy?",
    "When are reminders sent?",
    "What happens after 3 no-shows?",
    "Can fees be waived?",
]

hallucination_results = []

for question in test_questions:
    # Fetch the answer together with its sources, then check the answer
    # against the source contents.
    result = rag_chain.ask(question, return_sources=True)
    batch_answer = result["answer"]
    batch_contexts = [source["content"] for source in result.get("sources", [])]
    detection = detector.detect(
        question=question,
        answer=batch_answer,
        contexts=batch_contexts,
    )
    hallucination_results.append(detection)

# Aggregate the per-question detections into one summary mapping.
summary = detector.get_summary(hallucination_results)
print("\nBatch Hallucination Summary:")
print(f"  Total Samples: {summary['total_samples']}")
print(f"  With Hallucination: {summary['samples_with_hallucination']}")
# The two rate fields share the same percentage formatting.
for label, key in (
    ("Hallucination Rate", 'hallucination_rate'),
    ("Claim Support Rate", 'claim_support_rate'),
):
    print(f"  {label}: {summary[key]:.1%}")


# Cell 6: Safety Testing
"""
## Part 4: Safety Red-Team Testing
"""

from collections import Counter

from src.llm.evaluation import SafetyEvaluator

safety_eval = SafetyEvaluator()
safety_eval.load_default_tests()

print(f"Loaded {len(safety_eval.tests)} safety tests")
print("\nTest Categories:")
# Tally tests per category in one pass instead of re-scanning the whole test
# list once per category. Counter keeps first-seen order, so the listing is
# stable from run to run (iterating a plain set is not).
category_counts = Counter(t.category.value for t in safety_eval.tests)
# Kept so any later code that reads `categories` still finds the set.
categories = set(category_counts)
for cat, count in category_counts.items():
    print(f"  - {cat}: {count} tests")


# Cell 7: Run Safety Tests
"""
### Run Safety Tests
"""

# Run every loaded safety test against the RAG chain.
print("\nRunning safety tests...")
safety_results = safety_eval.run_tests(rag_chain)

print("\n" + "=" * 50)
print("SAFETY TEST RESULTS")
print("=" * 50)
print(f"Total Tests: {safety_results['total_tests']}")
print(f"Passed: {safety_results['passed']}")
print(f"Failed: {safety_results['failed']}")
print(f"Pass Rate: {safety_results['pass_rate']:.1%}")

# Per-category breakdown: a category is marked OK only with zero failures.
print("\nBy Category:")
for cat, data in safety_results['by_category'].items():
    status = "✅" if data['failed'] == 0 else "❌"
    print(f"  {status} {cat}: {data['passed']}/{data['passed'] + data['failed']} passed")

# Show at most five failed tests, with at most two reasons each, to keep the
# cell output readable.
if safety_results['failed_tests']:
    print("\nFailed Tests:")
    for test in safety_results['failed_tests'][:5]:
        print(f"  ❌ {test['name']} ({test['severity']})")
        for reason in test['reasons'][:2]:
            print(f"     - {reason}")


# Cell 8: Get Security Recommendations
"""
### Security Recommendations
"""

# Ask the safety evaluator for its recommendations and print them as a
# numbered list, one blank line before each entry.
recommendations = safety_eval.get_recommendations()

print("\nSecurity Recommendations:")
for position, recommendation in enumerate(recommendations, start=1):
    print(f"\n{position}. {recommendation}")


# Cell 9: Custom Metrics
"""
## Part 5: Custom Metrics
"""

from src.llm.evaluation import EvaluationMetrics

metrics = EvaluationMetrics()

# Test response
question = "How do I reschedule my appointment?"
result = rag_chain.ask(question)
answer = result["answer"]

# Calculate all metrics. Sources were not requested above, so an empty
# context list is passed.
all_metrics = metrics.calculate_all(
    question=question,
    answer=answer,
    contexts=[]
)

print("\n" + "=" * 50)
print("CUSTOM METRICS")
print("=" * 50)
print(f"Question: {question}")
# The answer is truncated to 200 characters for display.
print(f"Answer: {answer[:200]}...")

print("\nMetric Results:")
# Use a dedicated loop name: reusing `result` here would overwrite the RAG
# response stored in `result` above.
for name, metric_result in all_metrics.items():
    status = "✅" if metric_result.passed else "❌"
    print(f"  {status} {name}: {metric_result.score:.3f} (threshold: {metric_result.threshold})")

summary = metrics.get_summary(all_metrics)
print(f"\nOverall Pass Rate: {summary['overall_pass_rate']:.1%}")


# Cell 10: Regression Testing
"""
## Part 6: Regression Testing
"""

from src.llm.evaluation import RegressionTestSuite

# Questions whose current answers are recorded as the baseline.
baseline_questions = [
    "What is the no-show policy?",
    "How many reminders do patients receive?",
    "What are the consequences of missing appointments?",
    "Can appointment fees be waived?",
    "How do I cancel an appointment?"
]

suite = RegressionTestSuite(tolerance=0.05)

# Create baseline from current system
print("Creating baseline...")
baseline = suite.create_baseline(rag_chain, baseline_questions)

print(f"Baseline created with {len(baseline['tests'])} tests")

# Save baseline
baseline_path = project_root / "evals" / "baselines" / "demo_baseline.json"
# Create the target directory first so the save cannot fail on a fresh
# checkout where evals/baselines/ does not exist yet. Harmless if it exists.
baseline_path.parent.mkdir(parents=True, exist_ok=True)
suite.save_baseline(baseline, str(baseline_path))
print(f"Saved to: {baseline_path}")


# Cell 11: Run Regression Tests
"""
### Run Regression Tests

Simulating testing against baseline (should pass since it's the same system)
"""

# Load the saved baseline into a fresh suite and re-run it against the chain.
# NOTE(review): this suite uses the default tolerance, while the baseline was
# created by a suite with tolerance=0.05 - confirm that is intended.
suite2 = RegressionTestSuite()
suite2.load_baseline(str(baseline_path))

print("Running regression tests...")
regression_results = suite2.run(rag_chain)

print("\n" + "=" * 50)
print("REGRESSION TEST RESULTS")
print("=" * 50)
print(f"Passed: {regression_results['passed']}")
print(f"Tests: {regression_results['tests_passed']}/{regression_results['total_tests']}")
print(f"Average Similarity: {regression_results['similarity_score']:.2f}")

# List each failing question (first 40 characters) with its similarity score.
if regression_results['failed_tests']:
    print("\nFailed Tests:")
    for test in regression_results['failed_tests']:
        print(f"  ❌ {test['question'][:40]}...")
        print(f"     Similarity: {test['similarity']:.2f}")


# Cell 12: Full Evaluation Framework
"""
## Part 7: Complete Evaluation Framework
"""

from src.llm.evaluation import EvaluationFramework, EvaluationConfig
from src.llm.evaluation.framework import EvaluationType

# Evaluation kinds to include in the full run.
enabled_evaluations = [
    EvaluationType.RAG_QUALITY,
    EvaluationType.HALLUCINATION,
    EvaluationType.SAFETY,
    EvaluationType.PERFORMANCE,
]

# Per-metric RAG thresholds (looser than the 0.7/0.7/0.6 used in Part 2).
rag_score_thresholds = {
    "faithfulness": 0.6,
    "answer_relevancy": 0.6,
    "context_precision": 0.5,
}

config = EvaluationConfig(
    evaluation_types=enabled_evaluations,
    rag_thresholds=rag_score_thresholds,
    hallucination_threshold=0.3,
    safety_pass_rate=0.8,
    max_latency_ms=10000,
)

# Wire the framework to the chain and reuse the golden set built earlier.
framework = EvaluationFramework(config)
framework.register_rag_chain(rag_chain)
# NOTE(review): this sets a private attribute directly; presumably there is
# no public setter - confirm against EvaluationFramework.
framework._golden_set = golden

print("Running full evaluation suite...")
report = framework.run_full_evaluation()

print("\n" + "=" * 60, "FULL EVALUATION REPORT", "=" * 60, sep="\n")
# Only the first 2000 characters of the markdown report are shown.
markdown_report = report.to_markdown()
print(markdown_report[:2000])


# Cell 13: Production Monitoring
"""
## Part 8: Production Monitoring
"""

from src.llm.production.monitoring import LLMMonitor, MetricsCollector

import random
import time

# Monitor that records the simulated requests below.
monitor = LLMMonitor()

print("Simulating production traffic...")

for request_number in range(20):
    # Each tracked request sleeps for a random 0.1-0.5 s and reports a random
    # token count between 100 and 500.
    with monitor.track_request("gpt-4o-mini") as tracker:
        simulated_delay_s = random.uniform(0.1, 0.5)
        time.sleep(simulated_delay_s)
        simulated_tokens = random.randint(100, 500)
        tracker.set_tokens(simulated_tokens)

    # Record a cache hit or miss, chosen by a fair coin flip.
    cache_hit = random.random() > 0.5
    monitor.record_cache(hit=cache_hit)

# Statistics for the last five minutes.
stats = monitor.get_stats(window_minutes=5)

print("\n" + "=" * 50, "PRODUCTION METRICS", "=" * 50, sep="\n")
print(f"Requests: {stats['requests']['total']}")
latency = stats['latency_ms']
print(f"Avg Latency: {latency['avg']:.0f}ms")
print(f"P95 Latency: {latency['p95']:.0f}ms")
print(f"Error Rate: {stats['errors']['rate']:.1%}")
print(f"Cache Hit Rate: {stats['cache']['hit_rate']:.1%}")

# Overall health as reported by the monitor.
health = monitor.get_health()
print(f"\nHealth Status: {health['status']}")


# Cell 14: Error Handling Demo
"""
## Part 9: Error Handling
"""

from src.llm.production.error_handling import (
    safe_llm_call,
    CircuitBreaker,
    ErrorHandler
)


# The decorator is configured with three retries and a fallback message.
@safe_llm_call(
    max_retries=3,
    fallback="I apologize, I'm having trouble processing that.",
)
def generate_response(question):
    """Ask the RAG chain *question* and return the "answer" field of its reply."""
    reply = rag_chain.ask(question)
    return reply["answer"]


# Exercise the wrapped call once and show the first 200 characters.
response = generate_response("What is the cancellation policy?")
response_preview = response[:200]
print(f"Response: {response_preview}...")


# Cell 15: Circuit Breaker
"""
### Circuit Breaker Pattern
"""

# Demo circuit breaker, configured with a failure threshold of 3, a recovery
# timeout of 30 and a success threshold of 2.
breaker = CircuitBreaker(
    failure_threshold=3,
    recovery_timeout=30,
    success_threshold=2
)

print(f"Circuit Breaker State: {breaker.state}")


# The breaker instance is applied as a decorator around the chain call.
@breaker
def protected_call(question):
    """Ask the RAG chain *question* and return the "answer" field of its reply."""
    return rag_chain.ask(question)["answer"]


# Any exception from the protected call is reported here rather than raised,
# so the demo cell always completes.
try:
    result = protected_call("Test question")
    print(f"✅ Call succeeded, state: {breaker.state}")
except Exception as e:
    print(f"❌ Call failed: {e}")


# Cell 16: Capstone - Complete System Test
"""
## Part 10: Capstone - Complete Healthcare Assistant

Putting it all together: A production-ready healthcare assistant
with evaluation, safety, and monitoring.
"""

print("\n" + "=" * 60)
print("CAPSTONE: HEALTHCARE APPOINTMENT ASSISTANT")
print("=" * 60)

# 1. System Check
print("\n📋 System Check:")
print("  ✅ RAG Chain: Ready")
# Chunk count comes from the store's stats metadata; it falls back to 0 when
# the key is absent.
chunk_count = vector_store.get_stats()['metadata'].get('chunk_count', 0)
print(f"  ✅ Vector Store: {chunk_count} chunks")

# 2. Sample Interaction
print("\n💬 Sample Interaction:")
test_q = "What happens if I miss my appointment?"
response = rag_chain.ask(test_q, return_sources=True)
print(f"  Q: {test_q}")
# The answer is truncated to 200 characters for display.
print(f"  A: {response['answer'][:200]}...")

# 3. Quality Check (reuses the `metrics` object created in Part 5)
print("\n📊 Quality Metrics:")
metrics_result = metrics.calculate_all(test_q, response['answer'])
passed = sum(1 for r in metrics_result.values() if r.passed)
print(f"  Metrics Passed: {passed}/{len(metrics_result)}")

# 4. Hallucination Check (reuses the `detector` created in Part 3)
print("\n🔍 Hallucination Check:")
h_result = detector.detect(
    test_q,
    response['answer'],
    [s['content'] for s in response.get('sources', [])]
)
print(f"  Clean: {'✅ Yes' if not h_result.has_hallucination else '❌ No'}")

# 5. Safety Status (reuses `safety_results` from Part 4; tests are not re-run)
print("\n🛡️ Safety Status:")
print(f"  Tests Passed: {safety_results['passed']}/{safety_results['total_tests']}")

# 6. System Health (reuses `health` from Part 8)
print("\n❤️ System Health:")
print(f"  Status: {health['status'].upper()}")

print("\n" + "=" * 60)
print("CAPSTONE COMPLETE! 🎉")
print("=" * 60)


# Cell 17: Save Final Report
"""
## Save Final Evaluation Report
"""

import json

# Save comprehensive report: everything goes under evals/final_report/.
report_dir = project_root / "evals" / "final_report"
report_dir.mkdir(parents=True, exist_ok=True)

# Save evaluation report
report.save(str(report_dir / "evaluation_report.json"))

# Save safety results. The encoding is explicit so the file does not depend
# on the platform's default locale encoding.
with open(report_dir / "safety_results.json", "w", encoding="utf-8") as safety_file:
    json.dump(safety_results, safety_file, indent=2)

# Save regression baseline
suite.save_baseline(baseline, str(report_dir / "regression_baseline.json"))

print(f"Reports saved to: {report_dir}")
print("\nFiles created:")
# Sorted so the listing is stable across runs and filesystems. The loop name
# is distinct from the file handle above.
for report_file in sorted(report_dir.iterdir()):
    print(f"  - {report_file.name}")


# Cell 18: Course Summary
"""
## Course Complete! 🎓

### What You Built

**Month 1: Data Analytics Foundations**
- SQL querying for healthcare data
- Python data analysis with pandas
- Looker Studio dashboards
- EDA and data cleaning

**Month 2: Applied ML & MLOps**
- No-show prediction model
- Scikit-learn pipelines
- FastAPI deployment
- Docker containerization

**Month 3: Generative AI & LLM Applications**

*Week 8-9: Prompt Engineering*
- LLM client wrapper
- Prompt templates for healthcare
- Few-shot and chain-of-thought patterns
- Safety guardrails

*Week 10: LangChain Integration*
- LCEL chains
- Custom tools (ML API integration)
- Conversation memory
- Healthcare agent

*Week 11: RAG Pipeline*
- Document loading and chunking
- FAISS vector store
- Retrieval chains
- Conversational RAG

*Week 12: Evaluation & Production*
- Ragas evaluation
- Hallucination detection
- Safety red-teaming
- Regression testing
- Production monitoring

### Your Healthcare Appointment Assistant Features

1. ✅ No-show risk prediction
2. ✅ Natural language explanations
3. ✅ Intervention recommendations
4. ✅ Policy Q&A (RAG)
5. ✅ Conversational interface
6. ✅ Safety guardrails
7. ✅ Production monitoring
8. ✅ Comprehensive evaluation

### Next Steps

1. Deploy to cloud (AWS/GCP/Azure)
2. Add more policy documents
3. Integrate with EHR system
4. Add user authentication
5. Set up CI/CD for evaluations
6. Monitor in production
"""

# Closing message; the emoji was previously stored as mis-decoded bytes.
print("Congratulations on completing the course! 🎉")