# Session 1.12: Exception Handling

## **Essential for Building Robust PySpark Data Pipelines**

### **Learning Objectives**
By the end of this session, you will:
- Understand Python exception handling mechanisms
- Use try-except blocks for error management
- Handle specific healthcare data processing errors
- Build robust code essential for PySpark pipelines

---

### **Relevance to PySpark**
Exception handling is crucial for building robust PySpark data pipelines that can handle data quality issues, network failures, and processing errors without crashing.

---

## 1. Basic Exception Handling

In [None]:
def safe_division(numerator, denominator):
    """Divide ``numerator`` by ``denominator`` without letting errors escape.

    Returns the quotient on success. When the divisor is zero, or the
    operands do not support division, a message is printed and ``None`` is
    returned instead.
    """
    try:
        quotient = numerator / denominator
    except ZeroDivisionError:
        print(f"Error: Cannot divide {numerator} by zero")
        quotient = None
    except TypeError:
        print("Error: Invalid data types for division")
        quotient = None
    return quotient

# Exercise the happy path plus both handled failure modes
division_cases = [
    ("10 / 2", 10, 2),
    ("10 / 0", 10, 0),
    ("'10' / 2", '10', 2),
]
for label, top, bottom in division_cases:
    print(f"{label} = {safe_division(top, bottom)}")

# Healthcare example: BMI calculation with error handling
def safe_bmi_calculation(weight, height):
    """Calculate BMI as ``weight / height ** 2`` with proper error handling.

    Args:
        weight: Body weight; must be a positive number.
        height: Body height; must be a positive number.

    Returns:
        The BMI rounded to two decimals, or ``None`` after printing a
        message when an input is non-numeric, not positive (including NaN),
        or the arithmetic itself fails.
    """
    try:
        # "not x > 0" rather than "x <= 0": NaN compares False both ways, so
        # only this form rejects it instead of returning a NaN BMI.
        if not height > 0:
            raise ValueError("Height must be positive")
        if not weight > 0:
            raise ValueError("Weight must be positive")

        bmi = weight / (height ** 2)
        return round(bmi, 2)

    # ArithmeticError covers a height so small that its square underflows to
    # zero (ZeroDivisionError) or so large that it overflows (OverflowError).
    except (TypeError, ValueError, ArithmeticError) as e:
        print(f"BMI calculation error: {e}")
        return None

# Run the BMI helper over one valid input and three invalid ones
print("\nBMI Tests:")
bmi_cases = {
    "Normal": (70, 1.75),
    "Zero height": (70, 0),
    "Negative weight": (-70, 1.75),
    "String input": ('70', 1.75),
}
for label, (case_weight, case_height) in bmi_cases.items():
    print(f"{label}: {safe_bmi_calculation(case_weight, case_height)}")

## 2. Handling Data Processing Errors

In [None]:
def process_patient_data(patient_record):
    """Validate one raw patient record and derive its BMI.

    Args:
        patient_record: Mapping with the keys ``name``, ``age``, ``weight``
            and ``height``. Numeric fields may be numbers or numeric
            strings; they are converted with ``int`` / ``float``.

    Returns:
        On success, a dict holding the title-cased ``name``, the converted
        ``age``, ``weight`` and ``height``, the ``bmi`` rounded to two
        decimals, and ``status`` set to ``'processed'``. On any failure, a
        dict with an ``error`` message and ``status`` set to ``'error'``;
        no exception escapes this function.
    """
    try:
        # Validate required fields
        for field in ('name', 'age', 'weight', 'height'):
            if field not in patient_record:
                raise KeyError(f"Missing required field: {field}")

        # Validate data types and ranges
        name = str(patient_record['name']).strip()
        if not name:
            raise ValueError("Name cannot be empty")

        age = int(patient_record['age'])
        if not 0 <= age <= 150:
            raise ValueError(f"Invalid age: {age}")

        # Ranges are written as "not (low < x <= high)" so that NaN, which
        # compares False against everything, is rejected rather than accepted.
        weight = float(patient_record['weight'])
        if not 0 < weight <= 500:
            raise ValueError(f"Invalid weight: {weight}")

        height = float(patient_record['height'])
        if not 0 < height <= 3:
            raise ValueError(f"Invalid height: {height}")

        # Calculate BMI
        bmi = weight / (height ** 2)

        # Return processed data
        return {
            'name': name.title(),
            'age': age,
            'weight': weight,
            'height': height,
            'bmi': round(bmi, 2),
            'status': 'processed'
        }

    except KeyError as e:
        # str(KeyError) is the repr of its argument and would wrap the
        # message in an extra pair of quotes; use the raw argument instead.
        detail = e.args[0] if e.args else e
        return {'error': f"Missing data: {detail}", 'status': 'error'}
    except ValueError as e:
        return {'error': f"Invalid data: {e}", 'status': 'error'}
    except Exception as e:
        return {'error': f"Unexpected error: {e}", 'status': 'error'}

# Sample records: one valid, the others each exercising a different path
test_patients = [
    dict(name='John Doe', age=45, weight=75.5, height=1.75),
    dict(name='', age=32, weight=62.3, height=1.65),  # Empty name
    dict(name='Bob Johnson', age=-5, weight=88.2, height=1.80),  # Invalid age
    dict(name='Alice Brown', weight=55.8, height=1.62),  # Missing age
    dict(name='Charlie Wilson', age='45', weight='75.5', height='1.75'),  # String numbers
]

print("Patient Data Processing Results:")
print("=" * 50)
for i, patient in enumerate(test_patients, 1):
    result = process_patient_data(patient)
    # Trailing newline yields the blank separator line between patients
    print(f"Patient {i}: {result}\n")

## 3. File and Data Reading Error Handling

In [None]:
import json
import csv
from io import StringIO

def read_patient_json(json_string):
    """Parse a JSON document and report the outcome as a status dict.

    Returns ``{'data': <parsed value>, 'status': 'success'}`` when parsing
    works. Any failure is returned, not raised, as ``{'error': <message>,
    'status': 'error'}``; malformed JSON gets its own message prefix.
    """
    try:
        parsed = json.loads(json_string)
    except json.JSONDecodeError as exc:
        outcome = {'error': f"Invalid JSON format: {exc}", 'status': 'error'}
    except Exception as exc:
        outcome = {'error': f"Unexpected error reading JSON: {exc}", 'status': 'error'}
    else:
        outcome = {'data': parsed, 'status': 'success'}
    return outcome

def read_patient_csv(csv_string):
    """Read patient rows from CSV text, skipping rows that fail conversion.

    Args:
        csv_string: CSV text whose header row names the columns ``id``,
            ``name``, ``age``, ``weight`` and ``height``.

    Returns:
        ``{'data': [...], 'status': 'success'}`` where each entry holds
        ``id`` and ``name`` as read plus ``age`` as ``int`` and ``weight`` /
        ``height`` as ``float``. Rows that cannot be converted are reported
        with a printed warning and left out. A failure outside per-row
        handling yields ``{'error': <message>, 'status': 'error'}``.
    """
    try:
        reader = csv.DictReader(StringIO(csv_string))
        patients = []

        # Row numbers count data rows from 1; the header row is not counted.
        for row_num, row in enumerate(reader, 1):
            try:
                # Validate and convert data types
                patient = {
                    'id': row['id'],
                    'name': row['name'],
                    'age': int(row['age']),
                    'weight': float(row['weight']),
                    'height': float(row['height'])
                }
                patients.append(patient)
            except (ValueError, KeyError, TypeError) as e:
                # TypeError: DictReader fills the missing fields of a short
                # row with None, and int(None) / float(None) raise TypeError.
                # Without it one truncated row would abort the whole read.
                print(f"Warning: Error in row {row_num}: {e}")
                continue

        return {'data': patients, 'status': 'success'}

    except Exception as e:
        return {'error': f"Error reading CSV: {e}", 'status': 'error'}

# JSON parsing: one well-formed document and one missing its closing brace
valid_json = '{"id": "PT001", "name": "John Doe", "age": 45}'
invalid_json = '{"id": "PT001", "name": "John Doe", "age": 45'

print("JSON Reading Tests:")
for label, payload in (("Valid JSON", valid_json), ("Invalid JSON", invalid_json)):
    print(f"{label}: {read_patient_json(payload)}")

# CSV parsing: the second data row carries a non-numeric age
csv_data = "\n".join([
    "id,name,age,weight,height",
    "PT001,John Doe,45,75.5,1.75",
    "PT002,Jane Smith,invalid_age,62.3,1.65",
    "PT003,Bob Johnson,58,88.2,1.80",
])

print("\nCSV Reading Test:")
result = read_patient_csv(csv_data)
print(f"Result: {result}")

## 4. Custom Exceptions for Healthcare Domain

In [None]:
# Custom exception classes for healthcare domain
class HealthcareDataError(Exception):
    """Root of the healthcare exception hierarchy defined in this session.

    Catching this type also catches every more specific error below.
    """


class PatientValidationError(HealthcareDataError):
    """Raised when patient data fails a validation rule."""


class MedicalCalculationError(HealthcareDataError):
    """Raised when a medical calculation cannot be carried out."""


class ClinicalDataError(HealthcareDataError):
    """Raised when clinical data cannot be processed."""

def _vital_in_range(label, value, low, high):
    """Return whether ``low <= value <= high``, rejecting incomparable values."""
    try:
        return low <= value <= high
    except TypeError:
        # A value that cannot be ordered against numbers (e.g. the string
        # '72') is bad input, not an unexpected failure of the validator.
        raise PatientValidationError(
            f"{label} must be numeric, got {type(value).__name__}"
        ) from None


def validate_patient_vitals(vitals):
    """Validate patient vital signs with custom exceptions.

    Args:
        vitals: Mapping that may contain ``heart_rate`` (required, accepted
            from 30 to 200), ``blood_pressure`` (required, a
            ``'systolic/diastolic'`` string with systolic 60-300 and
            diastolic 30-200) and ``temperature`` (optional, accepted from
            95.0 to 110.0).

    Returns:
        ``{'status': 'valid', 'message': ...}`` when every check passes,
        ``{'status': 'invalid', 'error': ...}`` when a value is missing,
        malformed, non-numeric or out of range, and
        ``{'status': 'error', 'error': ...}`` for any other failure. No
        exception escapes this function.
    """
    try:
        # Heart rate validation
        heart_rate = vitals.get('heart_rate')
        if heart_rate is None:
            raise PatientValidationError("Heart rate is required")
        if not _vital_in_range("Heart rate", heart_rate, 30, 200):
            raise PatientValidationError(f"Heart rate {heart_rate} is outside normal range (30-200)")

        # Blood pressure validation
        bp = vitals.get('blood_pressure')
        if bp is None:
            raise PatientValidationError("Blood pressure is required")
        if not isinstance(bp, str):
            raise PatientValidationError("Blood pressure must be in 'systolic/diastolic' format")

        try:
            # ValueError covers both a non-integer part and a wrong number
            # of '/'-separated parts (the unpacking fails).
            systolic, diastolic = map(int, bp.split('/'))
        except ValueError:
            raise PatientValidationError(f"Invalid blood pressure format: {bp}") from None

        if not (60 <= systolic <= 300) or not (30 <= diastolic <= 200):
            raise PatientValidationError(f"Blood pressure {bp} is outside normal range")

        # Temperature validation (optional field)
        temp = vitals.get('temperature')
        if temp is not None and not _vital_in_range("Temperature", temp, 95.0, 110.0):
            raise PatientValidationError(f"Temperature {temp}°F is outside normal range (95-110)")

        return {'status': 'valid', 'message': 'All vitals are within normal ranges'}

    except PatientValidationError as e:
        return {'status': 'invalid', 'error': str(e)}
    except Exception as e:
        return {'status': 'error', 'error': f"Unexpected error: {e}"}

# Vital-sign samples: a clean reading followed by four problem cases
test_vitals = [
    dict(heart_rate=72, blood_pressure='120/80', temperature=98.6),
    dict(heart_rate=300, blood_pressure='120/80', temperature=98.6),  # High HR
    dict(heart_rate=72, blood_pressure='invalid', temperature=98.6),  # Invalid BP
    dict(blood_pressure='120/80', temperature=98.6),  # Missing HR
    dict(heart_rate=72, blood_pressure='120/80', temperature=115.0),  # High temp
]

print("Vital Signs Validation Tests:")
print("=" * 40)
for i, vitals in enumerate(test_vitals, 1):
    result = validate_patient_vitals(vitals)
    # One call emits the input, the verdict and the blank separator line
    print(f"Test {i}: {vitals}\nResult: {result}\n")

## 5. Exception Handling in Data Processing Pipelines

In [None]:
def robust_data_pipeline(patient_records):
    """Run each record through validation, keeping successes and failures apart.

    Every record is checked to be a dict and passed to
    ``process_patient_data``; a record that fails either step is written to
    the error log and the loop moves on. When a record carries a ``vitals``
    entry it is checked with ``validate_patient_vitals``; a non-valid result
    only adds a ``vitals_warning`` to the processed record.

    Returns a dict with ``processed_data`` (accepted records), ``error_log``
    (one entry per rejected record with its 1-based index, message and error
    type name) and ``summary`` (totals plus the success rate in percent).
    """
    accepted = []
    failures = []

    for position, record in enumerate(patient_records, start=1):
        try:
            # Anything that is not a dict cannot be a patient record
            if not isinstance(record, dict):
                raise ClinicalDataError(f"Record {position}: Expected dictionary, got {type(record)}")

            # Field-level validation is delegated; failures come back as a status dict
            patient = process_patient_data(record)
            if patient.get('status') == 'error':
                raise ClinicalDataError(f"Record {position}: {patient.get('error')}")

            # Problem vitals annotate the record but never reject it
            if 'vitals' in record:
                vitals_check = validate_patient_vitals(record['vitals'])
                if vitals_check['status'] != 'valid':
                    patient['vitals_warning'] = vitals_check.get('error')

            accepted.append(patient)

        except (ClinicalDataError, PatientValidationError, MedicalCalculationError) as exc:
            failures.append({'record_index': position, 'error': str(exc), 'type': type(exc).__name__})

        except Exception as exc:
            failures.append({'record_index': position, 'error': f"Unexpected error: {exc}", 'type': 'UnexpectedError'})

    # Each record ends up in exactly one list, so the counts are the lengths
    total = len(patient_records)
    succeeded = len(accepted)
    if patient_records:
        success_rate = round(succeeded / total * 100, 2)
    else:
        success_rate = 0

    return {
        'processed_data': accepted,
        'error_log': failures,
        'summary': {
            'total_records': total,
            'successful': succeeded,
            'errors': len(failures),
            'success_rate': success_rate,
        },
    }

# Pipeline input: valid records mixed with a bad age, a non-dict and bad vitals
test_records = [
    dict(name='John Doe', age=45, weight=75.5, height=1.75),
    dict(name='Jane Smith', age=32, weight=62.3, height=1.65,
         vitals=dict(heart_rate=72, blood_pressure='120/80')),
    dict(name='Invalid Patient', age=-5, weight=88.2, height=1.80),
    "not_a_dict",  # Invalid record type
    dict(name='Bob Johnson', age=58, weight=88.2, height=1.80,
         vitals=dict(heart_rate=300, blood_pressure='120/80')),  # Invalid vitals
]

pipeline_result = robust_data_pipeline(test_records)
processed_rows = pipeline_result['processed_data']
logged_errors = pipeline_result['error_log']

print("Data Pipeline Results:")
print("=" * 30)
print(f"Summary: {pipeline_result['summary']}")

# Both listings share one layout: a heading, then one indented line per entry
for heading, entries in (
    (f"\nProcessed Data ({len(processed_rows)} records):", processed_rows),
    (f"\nError Log ({len(logged_errors)} errors):", logged_errors),
):
    print(heading)
    for entry in entries:
        print(f"  {entry}")

## 6. Practice Exercise

Build a robust clinical data processor with comprehensive error handling.

In [None]:
# Exercise: Clinical lab results processing with error handling
# Exercise fixture: the first entry is well-formed, the rest carry the
# problems noted beside them
lab_results = [
    dict(patient_id='PT001', test='Glucose', value=95, unit='mg/dL', ref_range='70-100'),
    dict(patient_id='PT002', test='Cholesterol', value='invalid', unit='mg/dL', ref_range='<200'),  # Non-numeric value
    dict(patient_id='PT003', test='Blood Pressure', value='140/90', unit='mmHg'),  # Missing ref_range
    dict(patient_id='', test='Hemoglobin', value=14.2, unit='g/dL', ref_range='12-16'),  # Empty patient_id
    dict(patient_id='PT005', test='Temperature', value=105.0, unit='F', ref_range='97-99'),  # Above ref_range
]

# TODO: Create a function that:
# 1. Validates each lab result record
# 2. Handles missing or invalid data
# 3. Flags abnormal results based on reference ranges
# 4. Returns processed results with error logging
# 5. Calculates processing statistics

# Your code here

---

## Summary

In this session, you learned:
- ✅ Basic exception handling with try-except blocks
- ✅ Handling specific data processing errors
- ✅ File and data reading error management
- ✅ Creating custom exceptions for healthcare domain
- ✅ Building robust data processing pipelines
- ✅ Error logging and reporting
- ✅ Essential concepts for robust PySpark pipelines

**Next:** Session 1.13 - Context Managers and File I/O