# Session 1.13: Context Managers and File I/O

## **Essential for Resource Management in PySpark**

### **Learning Objectives**
By the end of this session, you will:
- Understand context managers and the `with` statement
- Handle file operations safely in healthcare data processing
- Manage resources properly for PySpark applications
- Build custom context managers for healthcare workflows

---

### **Relevance to PySpark**
Context managers ensure proper resource cleanup in PySpark applications, preventing memory leaks and connection issues when working with large datasets and external systems.

---

## 1. Basic File Operations with Context Managers

In [None]:
import tempfile
import os
import json
import csv
from contextlib import contextmanager

# Create a scratch directory for the demonstration files.
# mkdtemp() leaves deletion to the caller; the cleanup cell at the end of the
# session removes the directory again.
temp_dir = tempfile.mkdtemp()
print(f"Using temporary directory: {temp_dir}")

# Healthcare data for examples
patient_data = [
    {'id': 'PT001', 'name': 'John Doe', 'age': 45, 'diagnosis': 'Hypertension'},
    {'id': 'PT002', 'name': 'Jane Smith', 'age': 32, 'diagnosis': 'Diabetes'},
    {'id': 'PT003', 'name': 'Bob Johnson', 'age': 58, 'diagnosis': 'Asthma'}
]

# Write patient data to a JSON file. The `with` block closes the file even if
# json.dump raises, so the file handle is never leaked.
json_file_path = os.path.join(temp_dir, 'patients.json')

with open(json_file_path, 'w') as file:
    json.dump(patient_data, file, indent=2)
# Report only after the block has exited, i.e. once the file has been closed.
print(f"Patient data written to {json_file_path}")

# Read the file back
with open(json_file_path, 'r') as file:
    loaded_data = json.load(file)

print(f"\nLoaded {len(loaded_data)} patient records:")
for patient in loaded_data:
    print(f"  {patient['id']}: {patient['name']}, {patient['age']} years old")

# Write CSV file. newline='' hands control of line endings to the csv module.
csv_file_path = os.path.join(temp_dir, 'patients.csv')

with open(csv_file_path, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['id', 'name', 'age', 'diagnosis'])
    writer.writeheader()
    writer.writerows(patient_data)
print(f"\nCSV data written to {csv_file_path}")

# Read CSV file. newline='' is needed on the read side as well: without it,
# line breaks embedded in quoted fields are not interpreted correctly.
with open(csv_file_path, 'r', newline='') as file:
    reader = csv.DictReader(file)
    print("\nCSV data read back:")
    for row in reader:
        print(f"  {row}")

## 2. Error-Safe File Operations

In [None]:
def safe_file_read(file_path, file_type='text'):
    """Read a JSON, CSV, or text file and report the outcome as a dict.

    Args:
        file_path: Path of the file to read.
        file_type: 'json' parses the file with json.load; 'csv' returns a
            list of row dicts produced by csv.DictReader; any other value
            returns the whole file content as one string.

    Returns:
        {'data': <content>, 'status': 'success'} when the read succeeds,
        otherwise {'error': <message>, 'status': 'error'}. Every exception
        derived from Exception is converted into the error form rather than
        being raised to the caller.
    """
    # The csv module must see the raw line endings (newline=''); otherwise a
    # "\r\n" embedded in a quoted field is silently rewritten to "\n".
    # None is open()'s default and keeps JSON/text handling unchanged.
    newline = '' if file_type == 'csv' else None

    try:
        # A single `with` covers all three formats, so the handle is closed
        # on every path, including parse failures.
        with open(file_path, 'r', newline=newline) as file:
            if file_type == 'json':
                data = json.load(file)
            elif file_type == 'csv':
                # Materialize the rows while the file is still open.
                data = list(csv.DictReader(file))
            else:  # text file
                data = file.read()
        return {'data': data, 'status': 'success'}

    except FileNotFoundError:
        return {'error': f"File not found: {file_path}", 'status': 'error'}
    except json.JSONDecodeError as e:
        return {'error': f"Invalid JSON format: {e}", 'status': 'error'}
    except PermissionError:
        return {'error': f"Permission denied: {file_path}", 'status': 'error'}
    except Exception as e:
        return {'error': f"Unexpected error: {e}", 'status': 'error'}

def safe_file_write(file_path, data, file_type='json'):
    """Write data to a JSON, CSV, or text file and report the outcome as a dict.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Payload to write. For 'json' it is passed to json.dump. For
            'csv' it must be a non-empty indexable collection of rows: either
            dicts (the keys of the first row become the header) or plain
            sequences (written as-is, the first row serving as the header).
            For any other file_type, str(data) is written.
        file_type: 'json', 'csv', or anything else for plain text.

    Returns:
        {'message': <text>, 'status': 'success'} on success, otherwise
        {'error': <message>, 'status': 'error'}. Every exception derived from
        Exception is converted into the error form rather than being raised.
    """
    try:
        if file_type == 'json':
            with open(file_path, 'w') as file:
                json.dump(data, file, indent=2)

        elif file_type == 'csv':
            # Checked before opening so an empty payload never truncates an
            # existing file.
            if not data:
                return {'error': 'No data to write', 'status': 'error'}

            with open(file_path, 'w', newline='') as file:
                if isinstance(data[0], dict):
                    writer = csv.DictWriter(file, fieldnames=list(data[0].keys()))
                    writer.writeheader()
                    writer.writerows(data)
                else:
                    # Sequence rows: write every row, not only the first.
                    # Previously only data[0] was emitted (as the header) and
                    # the remaining rows were dropped while reporting success.
                    csv.writer(file).writerows(data)

        else:  # text file
            with open(file_path, 'w') as file:
                file.write(str(data))

        return {'message': f"Data written to {file_path}", 'status': 'success'}

    except PermissionError:
        return {'error': f"Permission denied: {file_path}", 'status': 'error'}
    except Exception as e:
        return {'error': f"Error writing file: {e}", 'status': 'error'}

# Exercise the safe read/write helpers: one successful read, one read that
# must fail, and one write of a brand-new file.
print("Testing Safe File Operations:")
print("=" * 40)

# Case 1: the JSON file written earlier in the session should load cleanly.
result = safe_file_read(json_file_path, 'json')
record_count = len(result.get('data', []))
print(f"Reading existing JSON: {result['status']}")
print(f"Records loaded: {record_count}")

# Case 2: a path that does not exist yields an error dict, not an exception.
missing_path = '/nonexistent/file.json'
result = safe_file_read(missing_path, 'json')
print(f"\nReading non-existent file: {result['status']}")
print(f"Error: {result.get('error')}")

# Case 3: write a single new patient record into the scratch directory.
new_data = [
    {'id': 'PT004', 'name': 'Alice Brown', 'age': 28, 'diagnosis': 'Migraine'},
]
new_file_path = os.path.join(temp_dir, 'new_patients.json')
result = safe_file_write(new_file_path, new_data, 'json')
print(f"\nWriting new file: {result['status']}")
print(f"Message: {result.get('message')}")

## 3. Custom Context Managers for Healthcare Workflows

In [None]:
import time
from datetime import datetime

@contextmanager
def healthcare_audit_log(operation_name, patient_id=None):
    """Print audit lines around a block of healthcare data processing.

    A "Starting" line is printed on entry. On exit either a "Completed" line
    or, when the block raised an Exception, a "Failed" line is printed; both
    carry the elapsed wall-clock time in seconds. A failure is re-raised
    after being logged, never suppressed.

    Args:
        operation_name: Label used in every audit line.
        patient_id: Optional patient identifier; when truthy it is appended
            to the "Starting" line.
    """
    began = datetime.now()

    def seconds_elapsed():
        # Wall-clock seconds since the context was entered.
        return (datetime.now() - began).total_seconds()

    header = f"[AUDIT] Starting {operation_name}"
    if patient_id:
        header += f" for patient {patient_id}"
    header += f" at {began.strftime('%Y-%m-%d %H:%M:%S')}"
    print(header)

    try:
        yield  # Run the body of the `with` statement
        print(f"[AUDIT] Completed {operation_name} successfully in {seconds_elapsed():.2f}s")
    except Exception as error:
        print(f"[AUDIT] Failed {operation_name} after {seconds_elapsed():.2f}s: {error}")
        raise  # Let the caller see the original exception

@contextmanager
def patient_data_session(patient_id):
    """Open a patient processing session and print a summary when it ends.

    Yields a dict with the keys 'patient_id', 'start_time' (a datetime taken
    on entry) and 'operations' (an initially empty list the caller appends
    descriptions to). The summary is printed from a `finally` clause, so it
    appears whether or not the block raised.

    Args:
        patient_id: Identifier shown in the banner and the summary.
    """
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"Opening patient data session for: {patient_id}")
    print(rule)

    session_data = {
        'patient_id': patient_id,
        'start_time': datetime.now(),
        'operations': [],
    }

    try:
        yield session_data
    finally:
        finished = datetime.now()
        elapsed = (finished - session_data['start_time']).total_seconds()
        print("\nSession Summary:")
        print(f"Patient ID: {patient_id}")
        print(f"Duration: {elapsed:.2f} seconds")
        print(f"Operations performed: {len(session_data['operations'])}")
        for description in session_data['operations']:
            print(f"  - {description}")
        print(f"{rule}\n")

class HealthcareDataProcessor:
    """Patient file processor usable as a context manager.

    Connections are simulated: each processed file registers a string id in
    ``active_connections``, and leaving the ``with`` block reports and clears
    them. Nothing is actually opened or closed.
    """

    def __init__(self, data_directory):
        # Stored and shown on entry; not otherwise used by this class.
        self.data_directory = data_directory
        # Ids of the simulated connections registered so far.
        self.active_connections = []

    def __enter__(self):
        """Announce start-up and hand the processor to the ``with`` block."""
        print("Initializing Healthcare Data Processor")
        print(f"Data directory: {self.data_directory}")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Report and clear simulated connections; never suppress errors."""
        print("Cleaning up Healthcare Data Processor")
        for connection_id in self.active_connections:
            print(f"  Closing connection: {connection_id}")
        self.active_connections.clear()

        if exc_type:
            print(f"  Exception occurred: {exc_type.__name__}: {exc_val}")
        # A falsy return value lets any in-flight exception propagate.
        return False

    def process_patient_file(self, file_path):
        """Load a patient JSON file and return its parsed content.

        A simulated connection id is registered before the read, so it stays
        registered until ``__exit__`` even when the read fails.

        Raises:
            Exception: carrying the error text reported by safe_file_read
                when the file could not be read.
        """
        sequence_number = len(self.active_connections) + 1
        self.active_connections.append(f"conn_{sequence_number}")

        outcome = safe_file_read(file_path, 'json')
        if outcome['status'] != 'success':
            raise Exception(outcome['error'])
        return outcome['data']

# Walk through the three custom context managers defined in this section.
print("Testing Custom Context Managers:")
print("=" * 50)

# 1) Audit log: the BMI computation is bracketed by audit lines.
with healthcare_audit_log("Patient BMI Calculation", "PT001"):
    weight, height = 75.5, 1.75
    bmi = weight / (height ** 2)
    time.sleep(0.1)  # Stand-in for real processing time
    print(f"  Calculated BMI: {bmi:.2f}")

# 2) Patient session: each recorded step is followed by a short pause, and
#    the steps are listed in the summary printed when the block exits.
with patient_data_session("PT002") as session:
    for step in (
        "Load patient demographics",
        "Calculate vital statistics",
        "Generate health report",
    ):
        session['operations'].append(step)
        time.sleep(0.05)

# 3) Data processor: read errors are handled inside the block, so the
#    processor's cleanup on exit sees no exception.
with HealthcareDataProcessor(temp_dir) as processor:
    try:
        data = processor.process_patient_file(json_file_path)
        print(f"Processed {len(data)} patient records")
    except Exception as error:
        print(f"Error processing file: {error}")

## 4. File Processing Pipelines with Context Managers

In [None]:
@contextmanager
def batch_file_processor(input_dir, output_dir, operation_name="Batch Processing"):
    """Bracket a batch run with a start banner and a closing summary.

    The output directory is created if it does not exist. The yielded dict
    holds the counters 'files_processed' and 'files_failed' (both starting at
    0, to be incremented by the caller) plus 'start_time'. The summary is
    printed from a `finally` clause, so it appears even when the block raises.

    Args:
        input_dir: Directory being read; only reported in the banner.
        output_dir: Directory for results; created when missing.
        operation_name: Label shown in the banner and the summary.
    """
    for banner_line in (
        f"Starting {operation_name}",
        f"Input directory: {input_dir}",
        f"Output directory: {output_dir}",
    ):
        print(banner_line)

    # The caller can write results straight away.
    os.makedirs(output_dir, exist_ok=True)

    stats = {'files_processed': 0, 'files_failed': 0, 'start_time': datetime.now()}

    try:
        yield stats
    finally:
        elapsed = (datetime.now() - stats['start_time']).total_seconds()
        print("\nBatch Processing Summary:")
        print(f"Operation: {operation_name}")
        print(f"Duration: {elapsed:.2f} seconds")
        print(f"Files processed: {stats['files_processed']}")
        print(f"Files failed: {stats['files_failed']}")

        attempted = stats['files_processed'] + stats['files_failed']
        if attempted > 0:
            success_rate = stats['files_processed'] / attempted * 100
        else:
            # Nothing attempted: report 0 rather than dividing by zero.
            success_rate = 0
        print(f"Success rate: {success_rate:.1f}%")

def process_clinical_data_files():
    """Create sample clinical files and run them through a batch pipeline.

    Three sample files are written to ``<temp_dir>/input``: two valid JSON
    patient lists and one file with deliberately invalid JSON. Every ``.json``
    file found there is read, each dict record in a list payload is stamped
    with 'processed_at' and 'file_source', and the result is written to
    ``<temp_dir>/output/processed_<name>``.

    A failure on one file is counted and reported, and processing continues
    with the next file. Relies on the module-level ``temp_dir`` and on
    batch_file_processor, healthcare_audit_log, safe_file_read and
    safe_file_write.
    """
    input_dir = os.path.join(temp_dir, 'input')
    output_dir = os.path.join(temp_dir, 'output')
    os.makedirs(input_dir, exist_ok=True)

    # Sample inputs. A str value is written verbatim (used for the invalid
    # file); anything else is serialized as JSON.
    sample_files = {
        'cardiology_patients.json': [
            {'id': 'CD001', 'name': 'Heart Patient 1', 'condition': 'Arrhythmia'},
            {'id': 'CD002', 'name': 'Heart Patient 2', 'condition': 'Hypertension'}
        ],
        'diabetes_patients.json': [
            {'id': 'DB001', 'name': 'Diabetes Patient 1', 'hba1c': 7.2},
            {'id': 'DB002', 'name': 'Diabetes Patient 2', 'hba1c': 6.8}
        ],
        'invalid_file.json': "invalid json content"
    }

    for filename, content in sample_files.items():
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'w') as f:
            if isinstance(content, str):
                f.write(content)  # Raw text: not valid JSON on purpose
            else:
                json.dump(content, f, indent=2)

    with batch_file_processor(input_dir, output_dir, "Clinical Data Transformation") as stats:

        # os.listdir returns entries in arbitrary order; sort for a
        # reproducible processing sequence and log.
        json_files = sorted(
            name for name in os.listdir(input_dir) if name.endswith('.json')
        )

        for filename in json_files:
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, f"processed_{filename}")

            # The try wraps the audit context (not the other way round) so a
            # failure reaches healthcare_audit_log and is logged as
            # "[AUDIT] Failed ..." before being handled here. Catching inside
            # the context made the audit report success for failed files.
            try:
                with healthcare_audit_log(f"Processing {filename}"):
                    result = safe_file_read(input_path, 'json')
                    if result['status'] != 'success':
                        raise RuntimeError(result['error'])

                    data = result['data']

                    # Stamp each record; non-list payloads and non-dict
                    # entries pass through untouched.
                    if isinstance(data, list):
                        for record in data:
                            if isinstance(record, dict):
                                record['processed_at'] = datetime.now().isoformat()
                                record['file_source'] = filename

                    write_result = safe_file_write(output_path, data, 'json')
                    if write_result['status'] != 'success':
                        raise RuntimeError(write_result['error'])

                    stats['files_processed'] += 1
                    print(f"  Successfully processed {filename}")

            except Exception as e:
                # Keep the batch going: one bad file must not stop the rest.
                stats['files_failed'] += 1
                print(f"  Failed to process {filename}: {e}")

# Run the clinical data processing pipeline: sample files are written under
# temp_dir, then each one is processed and a batch summary is printed.
process_clinical_data_files()

## 5. Resource Management for Database-like Operations

In [None]:
class HealthcareDatabase:
    """In-memory stand-in for a healthcare database connection.

    No real database is involved: the connection and the transaction are
    plain strings and every operation only prints what it would do. Used as a
    context manager, the object connects on entry and, on exit, finishes any
    open transaction (rollback if the block raised, commit otherwise) before
    disconnecting.
    """

    def __init__(self, db_name):
        self.db_name = db_name
        self.connection = None      # placeholder string while connected
        self.transaction = None     # placeholder string while one is open
        self.is_connected = False

    def connect(self):
        """Mark the database as connected and return the connection token."""
        print(f"Connecting to healthcare database: {self.db_name}")
        self.connection = f"connection_to_{self.db_name}"
        self.is_connected = True
        return self.connection

    def disconnect(self):
        """Drop the connection; does nothing when not connected."""
        if not self.is_connected:
            return
        print(f"Disconnecting from healthcare database: {self.db_name}")
        self.connection = None
        self.is_connected = False

    def begin_transaction(self):
        """Open a transaction.

        Raises:
            Exception: if the database is not connected.
        """
        if not self.is_connected:
            raise Exception("Not connected to database")
        print("Beginning transaction")
        self.transaction = f"transaction_{datetime.now().timestamp()}"

    def commit_transaction(self):
        """Close the open transaction as committed; no-op when none is open."""
        if not self.transaction:
            return
        print("Committing transaction")
        self.transaction = None

    def rollback_transaction(self):
        """Close the open transaction as rolled back; no-op when none is open."""
        if not self.transaction:
            return
        print("Rolling back transaction")
        self.transaction = None

    def __enter__(self):
        """Connect and hand the database object to the ``with`` block."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Finish any open transaction, disconnect, and propagate errors."""
        if self.transaction:
            # A failed block must not persist partial work.
            finish = self.rollback_transaction if exc_type else self.commit_transaction
            finish()

        self.disconnect()
        return False  # Never swallow the caller's exception

@contextmanager
def database_transaction(db):
    """Run the ``with`` block inside a transaction on ``db``.

    A transaction is begun on entry. It is committed when the block finishes
    normally and rolled back when the block (or the commit itself) raises;
    the exception is then re-raised unchanged.

    Args:
        db: Object providing begin_transaction(), commit_transaction() and
            rollback_transaction().

    Yields:
        The same ``db`` object, for convenience.
    """
    db.begin_transaction()
    try:
        yield db
        db.commit_transaction()
    except BaseException:
        # BaseException rather than Exception: KeyboardInterrupt and
        # SystemExit must also roll back, otherwise an interrupted block
        # would leave its transaction open.
        db.rollback_transaction()
        raise

def simulate_patient_data_operations():
    """Demonstrate the database context managers in three scenarios.

    1. A transaction that completes and is committed.
    2. A transaction whose block raises, so it is rolled back; the exception
       is caught here and reported.
    3. Two consecutive transactions on a single connection.

    All output is printed; nothing is returned.
    """
    print("Demonstrating Database Context Managers:")
    print("=" * 50)

    # Scenario 1: everything succeeds.
    print("\n1. Successful Database Operation:")
    with HealthcareDatabase("patient_records") as db, database_transaction(db):
        for progress in ("  Inserting patient record...", "  Updating patient vitals..."):
            print(progress)
            time.sleep(0.1)
        print("  Operation completed successfully")

    # Scenario 2: the block fails part-way through.
    print("\n2. Database Operation with Error:")
    try:
        with HealthcareDatabase("patient_records") as db, database_transaction(db):
            print("  Inserting patient record...")
            time.sleep(0.1)
            print("  Simulating error...")
            raise Exception("Simulated database error")
    except Exception as error:
        print(f"  Caught exception: {error}")

    # Scenario 3: one connection, two back-to-back transactions.
    print("\n3. Multiple Database Operations:")
    with HealthcareDatabase("clinical_data") as db:
        for description in (
            "  Transaction 1: Adding lab results",
            "  Transaction 2: Updating patient diagnosis",
        ):
            with database_transaction(db):
                print(description)
                time.sleep(0.05)

        print("  All operations completed")

# Run the database simulation: prints the three scenarios (commit, rollback
# after an error, and two transactions on one connection).
simulate_patient_data_operations()

## 6. Practice Exercise

Create a comprehensive healthcare data processing pipeline using context managers.

In [None]:
# Exercise: Healthcare Data ETL Pipeline with Context Managers
# Build a pipeline that:
# 1. Reads patient data from multiple sources (JSON, CSV)
# 2. Validates and transforms the data
# 3. Writes results to output files
# 4. Maintains audit logs
# 5. Handles errors gracefully
# 6. Cleans up resources properly

# Sample data for the exercise, keyed by the file name each record list is
# meant to be stored under. Records in 'vitals.json' refer to patients through
# 'patient_id', which matches the 'id' values in 'patients.json'.
# NOTE: only JSON samples are provided; a CSV source for step 1 has to be
# created as part of the exercise.
# NOTE(review): units are not stated in the data (weight, height and
# temperature look like kg, m and degrees Fahrenheit) - confirm before
# validating ranges.
exercise_data = {
    'patients.json': [
        {'id': 'PT100', 'name': 'Alice Johnson', 'age': 45, 'weight': 65.2, 'height': 1.68},
        {'id': 'PT101', 'name': 'Bob Wilson', 'age': 52, 'weight': 78.5, 'height': 1.75},
    ],
    'vitals.json': [
        {'patient_id': 'PT100', 'heart_rate': 72, 'blood_pressure': '120/80', 'temperature': 98.6},
        {'patient_id': 'PT101', 'heart_rate': 85, 'blood_pressure': '135/90', 'temperature': 99.1},
    ]
}

# TODO: Create context managers and functions for:
# - ETL pipeline management
# - Data validation and transformation
# - Result aggregation and reporting
# - Error handling and logging

# Your code here

## 7. Cleanup

In [None]:
# Clean up temporary files
import shutil

# Best-effort removal of the scratch directory: a failure is reported
# rather than raised, so the cleanup cell itself never errors out.
try:
    shutil.rmtree(temp_dir)
except Exception as cleanup_error:
    print(f"Error cleaning up: {cleanup_error}")
else:
    print(f"Cleaned up temporary directory: {temp_dir}")

---

## Summary

In this session, you learned:
- ✅ Basic file operations with context managers
- ✅ Error-safe file reading and writing
- ✅ Custom context managers for healthcare workflows
- ✅ Batch file processing pipelines
- ✅ Resource management for database-like operations
- ✅ Proper cleanup and error handling
- ✅ Essential patterns for PySpark resource management

**Next:** Session 1.14 - Iterators and Generators