# Session 1.14: Iterators and Generators

## **Memory-Efficient Data Processing for PySpark**

### **Learning Objectives**
By the end of this session, you will:
- Understand iterators and the iterator protocol
- Create and use generators for memory-efficient processing
- Apply iterator patterns to healthcare data processing
- Build memory-efficient pipelines essential for PySpark

---

### **Relevance to PySpark**
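Iterators and generators follow the same "compute only when asked" model as PySpark's lazy evaluation, and PySpark hands Python iterators directly to your code in partition-level APIs such as `mapPartitions` and `toLocalIterator`. Understanding these concepts is crucial for writing memory-efficient PySpark transformations over large datasets.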

---

## 1. Understanding Iterators

In [None]:
# Basic iterator concepts: iter() wraps an iterable, next() pulls one item at a time.
patient_list = ['PT001', 'PT002', 'PT003', 'PT004']

# Ask the list for an iterator over its elements.
patient_iter = iter(patient_list)

print("Manual iteration using next():")
for ordinal in ("First", "Second", "Third", "Fourth"):
    # Each next() call advances the iterator by exactly one element.
    print(f"{ordinal} patient: {next(patient_iter)}")

# A fifth next() has nothing left to return, so it raises StopIteration.
try:
    fifth_patient = next(patient_iter)
except StopIteration:
    print("Iterator exhausted - no more patients")
else:
    print(f"Fifth patient: {fifth_patient}")
# Creating a custom iterator for patient records
class PatientRecordIterator:
    """Single-pass iterator that hands out a processed copy of each patient record.

    Every record must provide 'id', 'name', 'age', 'weight' and 'height'.
    The emitted dict carries 'id', 'name', 'age', a 'bmi' computed as
    weight / height**2 (rounded to 2 places) and a 'status' of 'processed'.
    """

    def __init__(self, patient_data):
        self.patient_data = patient_data
        self.index = 0  # position of the next record to hand out

    def __iter__(self):
        # An iterator is its own iterable, which is what lets it drive a for loop.
        return self

    def __next__(self):
        position = self.index
        if position >= len(self.patient_data):
            raise StopIteration

        record = self.patient_data[position]
        self.index = position + 1

        # Copy the pass-through fields, then add the derived ones.
        processed = {key: record[key] for key in ('id', 'name', 'age')}
        processed['bmi'] = round(record['weight'] / (record['height'] ** 2), 2)
        processed['status'] = 'processed'
        return processed

# Sample patient data
patients = [
    {'id': 'PT001', 'name': 'John Doe', 'age': 45, 'weight': 75.5, 'height': 1.75},
    {'id': 'PT002', 'name': 'Jane Smith', 'age': 32, 'weight': 62.3, 'height': 1.65},
    {'id': 'PT003', 'name': 'Bob Johnson', 'age': 58, 'weight': 88.2, 'height': 1.80},
]

print("\nUsing custom PatientRecordIterator:")
patient_iterator = PatientRecordIterator(patients)

# The for loop calls iter() once and then next() until StopIteration.
for record in patient_iterator:
    print(f"Processed: {record}")

# Built-in functions that work with iterators
print("\nUsing built-in functions with iterators:")
ages = [p['age'] for p in patients]

# Each aggregate drains the iterator it receives, so every one gets a fresh iter(ages).
for label, aggregate in (("Sum of ages", sum), ("Max age", max), ("Min age", min)):
    age_iter = iter(ages)
    print(f"{label}: {aggregate(age_iter)}")

## 2. Introduction to Generators

In [None]:
def patient_bmi_generator(patient_records):
    """Lazily yield a BMI summary for each patient record.

    For every record, BMI is computed as ``weight / height ** 2`` and a dict
    with 'id', 'name', 'bmi' (rounded to 2 places) and 'category' (from
    ``get_bmi_category``) is yielded.

    Malformed records - missing keys, zero height, non-numeric values, or
    items that are not mappings at all - are reported on stdout and skipped,
    so one bad record never aborts the iteration.

    Args:
        patient_records: Iterable of patient records (dict-like).

    Yields:
        One summary dict per well-formed record, in input order.
    """
    for patient in patient_records:
        try:
            bmi = patient['weight'] / (patient['height'] ** 2)
            summary = {
                'id': patient['id'],
                'name': patient['name'],
                'bmi': round(bmi, 2),
                'category': get_bmi_category(bmi)
            }
        except (KeyError, ZeroDivisionError, TypeError) as e:
            # A record that is not a mapping (e.g. None) has no .get(); looking
            # the id up unconditionally would crash inside the error handler.
            patient_id = patient.get('id', 'unknown') if hasattr(patient, 'get') else 'unknown'
            print(f"Error processing patient {patient_id}: {e}")
        else:
            yield summary

def get_bmi_category(bmi):
    """Map a BMI value to its weight-category label.

    Bands: below 18.5 is 'Underweight', 18.5 up to (not including) 25 is
    'Normal weight', 25 up to (not including) 30 is 'Overweight', and
    anything else is 'Obese'.
    """
    # Upper bounds are exclusive and tested in ascending order.
    bands = ((18.5, 'Underweight'), (25, 'Normal weight'), (30, 'Overweight'))
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label
    return 'Obese'

# Using the generator
print("Using BMI Generator:")
bmi_gen = patient_bmi_generator(patients)

# Nothing has been computed yet: each loop step asks the generator for one more result.
for summary in bmi_gen:
    patient_id, bmi_value, category = summary['id'], summary['bmi'], summary['category']
    print(f"Patient {patient_id}: BMI {bmi_value} ({category})")

# Generator expressions (like list comprehensions but lazy)
print("\nUsing generator expressions:")
high_bmi_patients = (
    summary
    for summary in patient_bmi_generator(patients)
    if summary['bmi'] > 25
)

print("Patients with high BMI:")
for patient in high_bmi_patients:
    print(f"  {patient['name']}: BMI {patient['bmi']}")

# Memory efficiency demonstration
def large_patient_generator(count):
    """Lazily produce `count` simulated patient records, one per iteration.

    Record ids run from 'PT000001' upward; age, weight and height are drawn
    at random (age 18-90, weight 50-120 rounded to 1 place, height 1.5-2.0
    rounded to 2 places). No record exists until the consumer asks for it.
    """
    import random

    for number in range(1, count + 1):
        age = random.randint(18, 90)
        weight = round(random.uniform(50, 120), 1)
        height = round(random.uniform(1.5, 2.0), 2)
        yield {
            'id': f'PT{number:06d}',
            'name': f'Patient {number}',
            'age': age,
            'weight': weight,
            'height': height,
        }

print("\nMemory-efficient processing of large dataset:")
large_dataset = large_patient_generator(10000)

# Process only the first 5 patients without loading all 10,000.
# islice stops after exactly 5 items; a manual counter-and-break loop would
# pull (and throw away) a 6th record from the generator before noticing the limit.
from itertools import islice

count = 0  # stays 0 if the generator yields nothing
for count, patient in enumerate(islice(large_dataset, 5), start=1):
    bmi = patient['weight'] / (patient['height'] ** 2)
    print(f"  {patient['id']}: Age {patient['age']}, BMI {bmi:.1f}")

print(f"Processed {count} patients without loading all 10,000 into memory")

## 3. Advanced Generator Patterns

In [None]:
def clinical_data_pipeline(*generators):
    """Chain several iterables into one lazy stream.

    Items are yielded from the first iterable until it is exhausted, then
    from the second, and so on, in the order the iterables were passed.
    """
    for source in generators:
        # Delegate to the current source until it runs dry.
        yield from source

def patient_demographics_generator(patients):
    """Lazily yield one 'demographics' event per patient.

    Each event has 'type' set to 'demographics', the patient's id under
    'patient_id', and a 'data' dict holding 'name', 'age' and the
    'age_group' returned by ``get_age_group``.
    """
    for record in patients:
        patient_id = record['id']
        demographics = {'name': record['name'], 'age': record['age']}
        demographics['age_group'] = get_age_group(demographics['age'])
        yield {
            'type': 'demographics',
            'patient_id': patient_id,
            'data': demographics,
        }

def patient_vitals_generator(patients):
    """Lazily yield one simulated 'vitals' event per patient.

    The readings are random: heart rate 60-100, blood pressure formatted as
    'systolic/diastolic' with systolic 110-140 and diastolic 70-90, and a
    temperature between 97.0 and 99.5 rounded to 1 place.
    """
    import random

    for record in patients:
        patient_id = record['id']
        # Simulated readings, drawn in a fixed order for each patient.
        heart_rate = random.randint(60, 100)
        systolic = random.randint(110, 140)
        diastolic = random.randint(70, 90)
        temperature = round(random.uniform(97.0, 99.5), 1)
        yield {
            'type': 'vitals',
            'patient_id': patient_id,
            'data': {
                'heart_rate': heart_rate,
                'blood_pressure': f"{systolic}/{diastolic}",
                'temperature': temperature,
            },
        }

def patient_lab_results_generator(patients):
    """Lazily yield simulated 'lab_result' events: four tests per patient.

    For every patient, one event is produced for each of Glucose,
    Cholesterol, Hemoglobin and White Blood Cells, in that order. The value
    (50-200, rounded to 1 place) and the status are random; the unit comes
    from ``get_test_unit``.
    """
    import random

    panel = ('Glucose', 'Cholesterol', 'Hemoglobin', 'White Blood Cells')
    outcomes = ['Normal', 'High', 'Low']

    for record in patients:
        for test_name in panel:
            patient_id = record['id']
            result = {
                'test_name': test_name,
                'value': round(random.uniform(50, 200), 1),
                'unit': get_test_unit(test_name),
                'status': random.choice(outcomes),
            }
            yield {
                'type': 'lab_result',
                'patient_id': patient_id,
                'data': result,
            }

def get_age_group(age):
    """Return the age bracket for `age`.

    Under 18 is 'Pediatric', 18 up to (not including) 65 is 'Adult', and
    everything else is 'Senior'.
    """
    if age < 18:
        return 'Pediatric'
    return 'Adult' if age < 65 else 'Senior'

def get_test_unit(test_name):
    """Return the measurement unit for a lab test name.

    Names that are not in the lookup table get the generic label 'units'.
    """
    unit_by_test = dict.fromkeys(('Glucose', 'Cholesterol'), 'mg/dL')
    unit_by_test['Hemoglobin'] = 'g/dL'
    unit_by_test['White Blood Cells'] = 'K/µL'
    try:
        return unit_by_test[test_name]
    except KeyError:
        return 'units'

# Demonstrate generator chaining
print("Clinical Data Pipeline with Generator Chaining:")
print("=" * 55)

# One generator per record type; the slices keep the demo output short.
demo_patients = patients[:2]
demographics_gen = patient_demographics_generator(demo_patients)
vitals_gen = patient_vitals_generator(demo_patients)
labs_gen = patient_lab_results_generator(patients[:1])  # labs emit 4 rows per patient

# Chain generators together; nothing runs until the loop below starts pulling.
clinical_pipeline = clinical_data_pipeline(demographics_gen, vitals_gen, labs_gen)

# Tally of how many events of each type came through the pipeline.
data_counts = dict.fromkeys(('demographics', 'vitals', 'lab_result'), 0)

for clinical_data in clinical_pipeline:
    data_type, patient_id, data = (
        clinical_data[key] for key in ('type', 'patient_id', 'data')
    )
    data_counts[data_type] += 1
    print(f"[{data_type.upper()}] Patient {patient_id}: {data}")

print(f"\nData processed: {data_counts}")

## 4. Generator-Based Data Filtering and Transformation

In [None]:
def filter_patients_by_condition(patients, condition_func):
    """Lazily yield the patients for which `condition_func` returns a truthy value.

    Records are tested one at a time, in input order, only as the consumer
    asks for them.
    """
    yield from (candidate for candidate in patients if condition_func(candidate))

def transform_patient_data(patients, transform_func):
    """Lazily yield `transform_func(patient)` for each patient, in input order."""
    yield from map(transform_func, patients)

def aggregate_patient_data(patients, group_by_func, agg_func):
    """Group patients by a key and yield one aggregate dict per group.

    On the first request the whole input is consumed and bucketed by
    ``group_by_func(patient)``. Groups are then yielded in the order their
    key first appeared, each as a dict with 'group' (the key), 'count'
    (number of members) and 'aggregated_data' (``agg_func`` applied to the
    list of members).
    """
    buckets = {}

    # Grouping needs every record, so this stage cannot stream its input.
    for record in patients:
        buckets.setdefault(group_by_func(record), []).append(record)

    for key in buckets:
        members = buckets[key]
        yield {
            'group': key,
            'count': len(members),
            'aggregated_data': agg_func(members),
        }

# Extended patient dataset for demonstrations
extended_patients = [
    {'id': 'PT001', 'name': 'John Doe', 'age': 45, 'weight': 75.5, 'height': 1.75, 'condition': 'Hypertension'},
    {'id': 'PT002', 'name': 'Jane Smith', 'age': 32, 'weight': 62.3, 'height': 1.65, 'condition': 'Diabetes'},
    {'id': 'PT003', 'name': 'Bob Johnson', 'age': 58, 'weight': 88.2, 'height': 1.80, 'condition': 'Hypertension'},
    {'id': 'PT004', 'name': 'Alice Brown', 'age': 28, 'weight': 55.8, 'height': 1.62, 'condition': 'Asthma'},
    {'id': 'PT005', 'name': 'Charlie Wilson', 'age': 67, 'weight': 82.1, 'height': 1.78, 'condition': 'Diabetes'},
]

print("Generator-Based Data Processing:")
print("=" * 40)


def _unrounded_bmi(record):
    """BMI of one record (weight / height squared), before any rounding."""
    return record['weight'] / (record['height'] ** 2)


def _is_senior(record):
    """True when the patient is 65 or older."""
    return record['age'] >= 65


def _has_high_bmi(record):
    """True when the patient's unrounded BMI is above 25."""
    return _unrounded_bmi(record) > 25


def _add_derived_fields(record):
    """Copy of the record extended with 'bmi', 'age_group' and 'bmi_category'."""
    return {
        **record,
        'bmi': round(_unrounded_bmi(record), 1),
        'age_group': get_age_group(record['age']),
        'bmi_category': get_bmi_category(_unrounded_bmi(record)),
    }


def _summarize_condition_group(members):
    """Average age, average BMI and the ids of the patients in one group."""
    member_count = len(members)
    return {
        'avg_age': round(sum(m['age'] for m in members) / member_count, 1),
        'avg_bmi': round(sum(m['bmi'] for m in members) / member_count, 1),
        'patient_ids': [m['id'] for m in members],
    }


# Filter: Senior patients (age >= 65)
print("\n1. Senior Patients (age >= 65):")
senior_patients = filter_patients_by_condition(extended_patients, _is_senior)

for patient in senior_patients:
    print(f"  {patient['name']}, {patient['age']} years old")

# Filter: High BMI patients
print("\n2. High BMI Patients (BMI > 25):")
high_bmi_patients = filter_patients_by_condition(extended_patients, _has_high_bmi)

for patient in high_bmi_patients:
    bmi = _unrounded_bmi(patient)
    print(f"  {patient['name']}: BMI {bmi:.1f}")

# Transform: Add calculated fields
print("\n3. Transform - Add BMI and Age Group:")
enhanced_patients = transform_patient_data(extended_patients, _add_derived_fields)

# A generator can be consumed only once; materialize it because the
# enriched records are used both for display here and for aggregation below.
enhanced_list = list(enhanced_patients)
for patient in enhanced_list[:3]:  # Show first 3
    print(f"  {patient['name']}: BMI {patient['bmi']} ({patient['bmi_category']}), {patient['age_group']}")

# Aggregate: Group by condition
print("\n4. Aggregate - Group by Medical Condition:")
condition_aggregates = aggregate_patient_data(
    enhanced_list,
    lambda record: record['condition'],
    _summarize_condition_group,
)

for group in condition_aggregates:
    summary = group['aggregated_data']
    print(f"  {group['group']}: {group['count']} patients")
    print(f"    Avg Age: {summary['avg_age']}")
    print(f"    Avg BMI: {summary['avg_bmi']}")
    print(f"    Patients: {', '.join(summary['patient_ids'])}")
    print()

## 5. Generator Pipelines for Large Dataset Processing

In [None]:
def data_validation_generator(patients):
    """Yield only the patient records that pass validation; report the rest.

    A record is valid when it contains non-None 'id', 'name', 'age',
    'weight' and 'height', and when age is within 0-150, weight within
    20-300 and height within 1.0-2.5 (all bounds inclusive). Invalid
    records - including items that are not mappings at all - are counted,
    reported on stdout and skipped.

    A summary line with the valid/error counts is printed once the input has
    been fully consumed; a consumer that stops early never triggers it.

    Args:
        patients: Iterable of patient records (dict-like).

    Yields:
        The original record objects that passed every check, in input order.
    """
    required_fields = ('id', 'name', 'age', 'weight', 'height')
    # (field, inclusive lower bound, inclusive upper bound)
    range_checks = (('age', 0, 150), ('weight', 20, 300), ('height', 1.0, 2.5))

    valid_count = 0
    error_count = 0

    for patient in patients:
        try:
            # Validate required fields
            for field in required_fields:
                if field not in patient or patient[field] is None:
                    raise ValueError(f"Missing required field: {field}")

            # Validate data ranges; comparing a non-numeric value raises
            # TypeError, which is handled as a validation failure below.
            for field, lowest, highest in range_checks:
                if not (lowest <= patient[field] <= highest):
                    raise ValueError(f"Invalid {field}: {patient[field]}")

        except (ValueError, TypeError) as e:
            error_count += 1
            # A record that is not a mapping (e.g. None) has no .get(); looking
            # the id up unconditionally would crash inside the error handler.
            patient_id = patient.get('id', 'unknown') if hasattr(patient, 'get') else 'unknown'
            print(f"Validation error for patient {patient_id}: {e}")
            continue

        valid_count += 1
        yield patient

    print(f"\nValidation summary: {valid_count} valid, {error_count} errors")

def data_enrichment_generator(patients):
    """Lazily yield a copy of each patient extended with calculated fields.

    Added keys: 'bmi' (weight / height**2, rounded to 2 places),
    'bmi_category' (from ``get_bmi_category`` on the unrounded BMI),
    'age_group' (from ``get_age_group``) and a fixed, simulated
    'processed_timestamp'. The input records are not modified.
    """
    for record in patients:
        raw_bmi = record['weight'] / (record['height'] ** 2)

        enriched = {**record}
        enriched['bmi'] = round(raw_bmi, 2)
        enriched['bmi_category'] = get_bmi_category(raw_bmi)
        enriched['age_group'] = get_age_group(record['age'])
        enriched['processed_timestamp'] = '2024-01-15T10:30:00Z'  # Simulated timestamp

        yield enriched

def data_quality_scorer_generator(patients):
    """Lazily yield a copy of each patient annotated with a data-quality score.

    Four checks are worth 25 points each: a non-blank 'name', an 'age'
    within 18-100, a 'bmi' within 15-50, and a truthy 'processed_timestamp'.
    Every failed check adds one message to 'quality_issues'. The yielded
    dict also carries 'quality_score' and the 'quality_grade' returned by
    ``get_quality_grade``.
    """
    points_per_check = 25
    total_checks = 4

    for record in patients:
        issues = []

        # Completeness: the name must contain something other than whitespace.
        if not record.get('name', '').strip():
            issues.append('Empty name')

        # Age reasonableness
        age = record.get('age', 0)
        if not (18 <= age <= 100):
            issues.append(f'Unusual age: {age}')

        # BMI reasonableness
        bmi = record.get('bmi', 0)
        if not (15 <= bmi <= 50):
            issues.append(f'Unusual BMI: {bmi}')

        # Data freshness (simulated): any truthy timestamp counts.
        if not record.get('processed_timestamp'):
            issues.append('No processing timestamp')

        # Each check adds exactly one issue when it fails, so the score
        # follows directly from the number of issues collected.
        score = points_per_check * (total_checks - len(issues))

        yield {
            **record,
            'quality_score': score,
            'quality_grade': get_quality_grade(score),
            'quality_issues': issues,
        }

def get_quality_grade(score):
    """Convert a quality score to a letter grade.

    90 and above is 'A', 80 and above 'B', 70 and above 'C', 60 and above
    'D'; anything lower is 'F'.
    """
    # Cutoffs are inclusive and tested from the highest grade down.
    for cutoff, letter in ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')):
        if score >= cutoff:
            return letter
    return 'F'

# Create a comprehensive data processing pipeline
def healthcare_data_processing_pipeline(raw_patient_data):
    """Wire the validation, enrichment and quality-scoring generators together.

    Returns the final generator of the chain. Calling this function does no
    processing; records flow through all three stages one at a time as the
    returned generator is consumed.
    """
    stages = (
        data_validation_generator,      # Step 1: drop invalid records
        data_enrichment_generator,      # Step 2: add calculated fields
        data_quality_scorer_generator,  # Step 3: attach quality score and grade
    )

    stream = raw_patient_data
    for stage in stages:
        # Each stage wraps the previous one's output.
        stream = stage(stream)
    return stream

# Test data with some invalid records
test_patients = [
    {'id': 'PT001', 'name': 'John Doe', 'age': 45, 'weight': 75.5, 'height': 1.75},
    {'id': 'PT002', 'name': '', 'age': 32, 'weight': 62.3, 'height': 1.65},  # Empty name
    {'id': 'PT003', 'name': 'Bob Johnson', 'age': 158, 'weight': 88.2, 'height': 1.80},  # Invalid age
    {'id': 'PT004', 'name': 'Alice Brown', 'age': 28, 'weight': 55.8, 'height': 1.62},
    {'id': 'PT005', 'name': 'Charlie Wilson', 'age': 67, 'weight': 82.1},  # Missing height
]

print("Healthcare Data Processing Pipeline:")
print("=" * 45)

# Building the pipeline does no work yet; it only connects the stages.
processed_pipeline = healthcare_data_processing_pipeline(test_patients)

print("\nProcessed Patient Data:")
# list() pulls every record through all stages, so the validation messages
# and the validation summary are printed at this point.
processed_patients = list(processed_pipeline)

# Report each patient and gather the pipeline statistics in the same pass.
quality_distribution = {}
total_quality = 0
for patient in processed_patients:
    grade = patient['quality_grade']
    issues = patient['quality_issues']

    print(f"\nPatient {patient['id']} ({patient['name']})")
    print(f"  Age: {patient['age']}, BMI: {patient['bmi']} ({patient['bmi_category']})")
    print(f"  Quality Score: {patient['quality_score']}/100 (Grade: {grade})")
    if issues:
        print(f"  Issues: {', '.join(issues)}")

    quality_distribution[grade] = quality_distribution.get(grade, 0) + 1
    total_quality += patient['quality_score']

print(f"\nQuality Distribution: {quality_distribution}")
avg_quality = total_quality / len(processed_patients)
print(f"Average Quality Score: {avg_quality:.1f}/100")

## 6. Memory Usage Comparison

In [None]:
import sys
import time

def memory_efficient_processing_demo(dataset_size=1000):
    """Print a list-versus-generator comparison of build time and memory footprint.

    Builds ``dataset_size`` simulated patient records twice - once as a fully
    materialized list and once as a lazy generator - processes the first five
    records of each, and prints timings and memory figures to stdout.

    The list's memory figure is a deep size: the list object plus every
    record dict and the keys/values it holds. ``sys.getsizeof`` on its own
    reports only the list's own storage and would leave out the records
    entirely. The generator's figure is the size of the generator object
    itself, which holds no records.

    Args:
        dataset_size: Number of simulated records to build/generate.
            Defaults to 1000 to keep the demo fast.

    Returns:
        None. All results are printed.
    """
    import random
    from itertools import islice

    def make_patient(number):
        """Build one simulated patient record for the 1-based `number`."""
        return {
            'id': f'PT{number:06d}',
            'name': f'Patient {number}',
            'age': random.randint(18, 90),
            'weight': round(random.uniform(50, 120), 1),
            'height': round(random.uniform(1.5, 2.0), 2)
        }

    def create_large_patient_list(size):
        """Create large patient list (memory intensive)."""
        return [make_patient(i + 1) for i in range(size)]

    def create_large_patient_generator(size):
        """Create large patient generator (memory efficient)."""
        for i in range(size):
            yield make_patient(i + 1)

    def deep_sizeof(obj, seen):
        """Size of `obj` plus everything it contains, counting shared objects once."""
        if id(obj) in seen:
            return 0
        seen.add(id(obj))
        total = sys.getsizeof(obj)
        if isinstance(obj, dict):
            total += sum(deep_sizeof(k, seen) + deep_sizeof(v, seen) for k, v in obj.items())
        elif isinstance(obj, (list, tuple, set, frozenset)):
            total += sum(deep_sizeof(item, seen) for item in obj)
        return total

    def process_first(records, limit=5):
        """Compute BMI for at most `limit` records; return how many were processed."""
        processed = 0
        # islice never pulls more than `limit` items from a generator.
        for patient in islice(records, limit):
            _bmi = patient['weight'] / (patient['height'] ** 2)
            processed += 1
        return processed

    print("Memory Usage Comparison:")
    print("=" * 30)

    # List approach
    print(f"\n1. List Approach (loading {dataset_size:,} patients):")
    # perf_counter is the clock intended for measuring short intervals.
    start_time = time.perf_counter()
    patient_list = create_large_patient_list(dataset_size)
    list_creation_time = time.perf_counter() - start_time
    list_size = deep_sizeof(patient_list, set())

    print(f"   Creation time: {list_creation_time:.4f} seconds")
    print(f"   Memory usage: {list_size:,} bytes")
    print(f"   (list object alone: {sys.getsizeof(patient_list):,} bytes; the rest is the records it holds)")

    # Process first 5 patients from list
    start_time = time.perf_counter()
    process_first(patient_list)
    list_processing_time = time.perf_counter() - start_time

    print(f"   Processing time (first 5): {list_processing_time:.6f} seconds")

    # Generator approach
    print(f"\n2. Generator Approach (generating {dataset_size:,} patients):")
    start_time = time.perf_counter()
    patient_generator = create_large_patient_generator(dataset_size)
    generator_creation_time = time.perf_counter() - start_time
    # Nothing has been generated yet, so this is the whole footprint.
    generator_size = sys.getsizeof(patient_generator)

    print(f"   Creation time: {generator_creation_time:.6f} seconds")
    print(f"   Memory usage: {generator_size:,} bytes")

    # Process first 5 patients from generator
    start_time = time.perf_counter()
    process_first(patient_generator)
    generator_processing_time = time.perf_counter() - start_time

    print(f"   Processing time (first 5): {generator_processing_time:.6f} seconds")

    # Comparison
    print(f"\n3. Comparison:")
    memory_savings = ((list_size - generator_size) / list_size) * 100
    print(f"   Memory savings: {memory_savings:.1f}%")
    print(f"   List memory / Generator memory: {list_size / generator_size:.1f}x")

    print(f"\n   Key insight: Generator uses constant memory regardless of dataset size!")
    print(f"   This is crucial for PySpark when processing large datasets.")

# Run the comparison; all results are printed to stdout.
memory_efficient_processing_demo()

## 7. Practice Exercise

Build a memory-efficient clinical trial data processor using generators.

In [None]:
# Exercise: Clinical Trial Data Processing Pipeline
# Build a generator-based pipeline that:
# 1. Generates simulated clinical trial participant data
# 2. Filters participants based on inclusion criteria
# 3. Randomizes participants into treatment groups
# 4. Tracks and aggregates trial statistics
# 5. Generates progress reports without loading all data into memory

import random
from datetime import datetime, timedelta

# Sample inclusion criteria for a hypertension trial
# NOTE(review): the age bounds are presumably in years and the systolic blood
# pressure bounds in mmHg; whether the limits are inclusive is not stated
# here - decide and document that when writing eligibility_filter_generator.
inclusion_criteria = {
    'min_age': 18,
    'max_age': 75,
    'min_systolic_bp': 140,
    'max_systolic_bp': 180,
    'required_conditions': ['Hypertension']
}

# TODO: Create generators for:
# 1. participant_data_generator(count) - Generate participant records
# 2. eligibility_filter_generator(participants, criteria) - Filter eligible participants
# 3. randomization_generator(participants, groups) - Assign treatment groups
# 4. trial_statistics_generator(participants) - Generate running statistics

# Your code here

---

## Summary

In this session, you learned:
- ✅ Iterator protocol and custom iterators
- ✅ Generator functions and generator expressions
- ✅ Advanced generator patterns and chaining
- ✅ Memory-efficient data filtering and transformation
- ✅ Generator-based processing pipelines
- ✅ Memory usage optimization techniques
- ✅ Essential concepts for PySpark's lazy evaluation

**Next:** Session 1.15 - String Processing and Text Analysis